From 1c9f2c7606afe149800986182638f636646dd824 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 10 Jan 2024 08:28:16 -1000 Subject: [PATCH 01/28] kernfs: Rearrange kernfs_node fields to reduce its size on 64bit Moving .flags and .mode right below .hash makes kernfs_node smaller by 8 bytes on 64bit. To avoid creating a hole from 8-byte alignment on 32bit archs, .priv is moved below so that there are two 32bit pointers after the 64bit .id field. v2: Updated to avoid size increase on 32bit noticed by Geert. Signed-off-by: Tejun Heo Cc: Geert Uytterhoeven Link: https://lore.kernel.org/r/ZZ7hwA18nfmFjYpj@slm.duckdns.org Signed-off-by: Greg Kroah-Hartman --- include/linux/kernfs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 99aaa050ccb7..82e1ce79a70c 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -206,22 +206,22 @@ struct kernfs_node { const void *ns; /* namespace tag */ unsigned int hash; /* ns + name hash */ + unsigned short flags; + umode_t mode; + union { struct kernfs_elem_dir dir; struct kernfs_elem_symlink symlink; struct kernfs_elem_attr attr; }; - void *priv; - /* * 64bit unique ID. On 64bit ino setups, id is the ino. On 32bit, * the low 32bits are ino and upper generation. */ u64 id; - unsigned short flags; - umode_t mode; + void *priv; struct kernfs_iattrs *iattr; }; From 4207b556e62f0a8915afc5da4c5d5ad915a253a5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2024 11:48:04 -1000 Subject: [PATCH 02/28] kernfs: RCU protect kernfs_nodes and avoid kernfs_idr_lock in kernfs_find_and_get_node_by_id() The BPF helper bpf_cgroup_from_id() calls kernfs_find_and_get_node_by_id() which acquires kernfs_idr_lock, which is a non-raw, non-IRQ-safe lock. This can lead to deadlocks as bpf_cgroup_from_id() can be called from any BPF program including e.g. the ones that attach to functions which are holding the scheduler rq lock.
Consider the following BPF program: SEC("fentry/__set_cpus_allowed_ptr_locked") int BPF_PROG(__set_cpus_allowed_ptr_locked, struct task_struct *p, struct affinity_context *affn_ctx, struct rq *rq, struct rq_flags *rf) { struct cgroup *cgrp = bpf_cgroup_from_id(p->cgroups->dfl_cgrp->kn->id); if (cgrp) { bpf_printk("%d[%s] in %s", p->pid, p->comm, cgrp->kn->name); bpf_cgroup_release(cgrp); } return 0; } __set_cpus_allowed_ptr_locked() is called with rq lock held and the above BPF program calls bpf_cgroup_from_id() within leading to the following lockdep warning: ===================================================== WARNING: HARDIRQ-safe -> HARDIRQ-unsafe lock order detected 6.7.0-rc3-work-00053-g07124366a1d7-dirty #147 Not tainted ----------------------------------------------------- repro/1620 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire: ffffffff833b3688 (kernfs_idr_lock){+.+.}-{2:2}, at: kernfs_find_and_get_node_by_id+0x1e/0x70 and this task is already holding: ffff888237ced698 (&rq->__lock){-.-.}-{2:2}, at: task_rq_lock+0x4e/0xf0 which would create a new lock dependency: (&rq->__lock){-.-.}-{2:2} -> (kernfs_idr_lock){+.+.}-{2:2} ... Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(kernfs_idr_lock); local_irq_disable(); lock(&rq->__lock); lock(kernfs_idr_lock); lock(&rq->__lock); *** DEADLOCK *** ... 
Call Trace: dump_stack_lvl+0x55/0x70 dump_stack+0x10/0x20 __lock_acquire+0x781/0x2a40 lock_acquire+0xbf/0x1f0 _raw_spin_lock+0x2f/0x40 kernfs_find_and_get_node_by_id+0x1e/0x70 cgroup_get_from_id+0x21/0x240 bpf_cgroup_from_id+0xe/0x20 bpf_prog_98652316e9337a5a___set_cpus_allowed_ptr_locked+0x96/0x11a bpf_trampoline_6442545632+0x4f/0x1000 __set_cpus_allowed_ptr_locked+0x5/0x5a0 sched_setaffinity+0x1b3/0x290 __x64_sys_sched_setaffinity+0x4f/0x60 do_syscall_64+0x40/0xe0 entry_SYSCALL_64_after_hwframe+0x46/0x4e Let's fix it by protecting kernfs_node and kernfs_root with RCU and making kernfs_find_and_get_node_by_id() acquire rcu_read_lock() instead of kernfs_idr_lock. This adds an rcu_head to kernfs_node making it larger by 16 bytes on 64bit. Combined with the preceding rearrange patch, the net increase is 8 bytes. Signed-off-by: Tejun Heo Cc: Andrea Righi Cc: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240109214828.252092-4-tj@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 31 ++++++++++++++++++++----------- fs/kernfs/kernfs-internal.h | 2 ++ include/linux/kernfs.h | 2 ++ 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index bce1d7ac95ca..458519e416fe 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -529,6 +529,20 @@ void kernfs_get(struct kernfs_node *kn) } EXPORT_SYMBOL_GPL(kernfs_get); +static void kernfs_free_rcu(struct rcu_head *rcu) +{ + struct kernfs_node *kn = container_of(rcu, struct kernfs_node, rcu); + + kfree_const(kn->name); + + if (kn->iattr) { + simple_xattrs_free(&kn->iattr->xattrs, NULL); + kmem_cache_free(kernfs_iattrs_cache, kn->iattr); + } + + kmem_cache_free(kernfs_node_cache, kn); +} + /** * kernfs_put - put a reference count on a kernfs_node * @kn: the target kernfs_node @@ -557,16 +571,11 @@ void kernfs_put(struct kernfs_node *kn) if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); - kfree_const(kn->name); - - if (kn->iattr) { - 
simple_xattrs_free(&kn->iattr->xattrs, NULL); - kmem_cache_free(kernfs_iattrs_cache, kn->iattr); - } spin_lock(&kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); spin_unlock(&kernfs_idr_lock); - kmem_cache_free(kernfs_node_cache, kn); + + call_rcu(&kn->rcu, kernfs_free_rcu); kn = parent; if (kn) { @@ -575,7 +584,7 @@ void kernfs_put(struct kernfs_node *kn) } else { /* just released the root kn, free @root too */ idr_destroy(&root->ino_idr); - kfree(root); + kfree_rcu(root, rcu); } } EXPORT_SYMBOL_GPL(kernfs_put); @@ -715,7 +724,7 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, ino_t ino = kernfs_id_ino(id); u32 gen = kernfs_id_gen(id); - spin_lock(&kernfs_idr_lock); + rcu_read_lock(); kn = idr_find(&root->ino_idr, (u32)ino); if (!kn) @@ -739,10 +748,10 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count))) goto err_unlock; - spin_unlock(&kernfs_idr_lock); + rcu_read_unlock(); return kn; err_unlock: - spin_unlock(&kernfs_idr_lock); + rcu_read_unlock(); return NULL; } diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 237f2764b941..b42ee6547cdc 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -49,6 +49,8 @@ struct kernfs_root { struct rw_semaphore kernfs_rwsem; struct rw_semaphore kernfs_iattr_rwsem; struct rw_semaphore kernfs_supers_rwsem; + + struct rcu_head rcu; }; /* +1 to avoid triggering overflow warning when negating it */ diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 82e1ce79a70c..87c79d076d6d 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -223,6 +223,8 @@ struct kernfs_node { void *priv; struct kernfs_iattrs *iattr; + + struct rcu_head rcu; }; /* From 05d8f255867e3196565bb31a911a437697fab094 Mon Sep 17 00:00:00 2001 From: Neel Natu Date: Sat, 27 Jan 2024 15:46:36 -0800 Subject: [PATCH 03/28] kernfs: fix false-positive 
WARN(nr_mmapped) in kernfs_drain_open_files Prior to this change 'on->nr_mmapped' tracked the total number of mmaps across all of its associated open files via kernfs_fop_mmap(). Thus if the file descriptor associated with a kernfs_open_file was mmapped 10 times then we would have: 'of->mmapped = true' and 'of_on(of)->nr_mmapped = 10'. The problem is that closing or draining an 'of->mmapped' file would only decrement the 'of_on(of)->nr_mmapped' counter by one. For example, we have this from kernfs_unlink_open_file(): if (of->mmapped) on->nr_mmapped--; The WARN_ON_ONCE(on->nr_mmapped) in kernfs_drain_open_files() is easy to reproduce by: 1. opening a (mmap-able) kernfs file. 2. mmap-ing that file more than once (mapping just once masks the issue). 3. triggering a drain of that kernfs file. Modulo out-of-tree patches I was able to trigger this reliably by identifying PCI device nodes in sysfs that have resource regions that are mmap-able and that don't have any driver attached to them (steps 1 and 2). For step 3 we can "echo 1 > remove" to trigger a kernfs_drain. Fix this by having kernfs_fop_mmap() set 'of->mmapped' and increment 'nr_mmapped' only the first time a given open file is mmapped, so that the counter tracks the number of mmapped open files rather than the number of mmap calls.
Signed-off-by: Neel Natu Link: https://lore.kernel.org/r/20240127234636.609265-1-neelnatu@google.com Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/file.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index ffa4565c275a..e9df2f87072c 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -483,9 +483,11 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) goto out_put; rc = 0; - of->mmapped = true; - of_on(of)->nr_mmapped++; - of->vm_ops = vma->vm_ops; + if (!of->mmapped) { + of->mmapped = true; + of_on(of)->nr_mmapped++; + of->vm_ops = vma->vm_ops; + } vma->vm_ops = &kernfs_vm_ops; out_put: kernfs_put_active(of->kn); From 3a480d4bb5b1e1f09426223e68acaa90da32e384 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 5 Jan 2024 11:26:48 +0100 Subject: [PATCH 04/28] driver core: cpu: make cpu_subsys const Now that the driver core can properly handle constant struct bus_type, move the cpu_subsys variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Cc: "Rafael J. 
Wysocki" Link: https://lore.kernel.org/r/2024010548-crane-snooze-a871@gregkh Signed-off-by: Greg Kroah-Hartman --- drivers/base/cpu.c | 2 +- include/linux/cpu.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 47de0f140ba6..ac84854c85d7 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -366,7 +366,7 @@ static int cpu_uevent(const struct device *dev, struct kobj_uevent_env *env) } #endif -struct bus_type cpu_subsys = { +const struct bus_type cpu_subsys = { .name = "cpu", .dev_name = "cpu", .match = cpu_subsys_match, diff --git a/include/linux/cpu.h b/include/linux/cpu.h index dcb89c987164..0b993a140946 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -128,7 +128,7 @@ static inline void cpu_maps_update_done(void) static inline int add_cpu(unsigned int cpu) { return 0;} #endif /* CONFIG_SMP */ -extern struct bus_type cpu_subsys; +extern const struct bus_type cpu_subsys; extern int lockdep_is_cpus_held(void); From f297a3844aa059c53be3f69be85ebc071b8a6d16 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 6 Jan 2024 21:57:50 -0800 Subject: [PATCH 05/28] driver core: component: fix spellos Correct spelling mistakes reported by codespell. Signed-off-by: Randy Dunlap Cc: "Rafael J. Wysocki" Cc: dri-devel@lists.freedesktop.org Link: https://lore.kernel.org/r/20240107055750.22441-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman --- drivers/base/component.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/base/component.c b/drivers/base/component.c index 7dbf14a1d915..741497324d78 100644 --- a/drivers/base/component.c +++ b/drivers/base/component.c @@ -751,7 +751,7 @@ static int __component_add(struct device *dev, const struct component_ops *ops, * component_bind_all(). See also &struct component_ops. * * @subcomponent must be nonzero and is used to differentiate between multiple - * components registerd on the same device @dev. 
These components are match + * components registered on the same device @dev. These components are match * using component_match_add_typed(). * * The component needs to be unregistered at driver unload/disconnect by @@ -781,7 +781,7 @@ EXPORT_SYMBOL_GPL(component_add_typed); * The component needs to be unregistered at driver unload/disconnect by * calling component_del(). * - * See also component_add_typed() for a variant that allows multipled different + * See also component_add_typed() for a variant that allows multiple different * components on the same device. */ int component_add(struct device *dev, const struct component_ops *ops) From 5df9197edd731a44682444631d68c2384f381bf1 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Tue, 6 Feb 2024 15:05:06 -0300 Subject: [PATCH 06/28] workqueue: make wq_subsys const Now that the driver core can properly handle constant struct bus_type, move the wq_subsys variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Suggested-by: Greg Kroah-Hartman Signed-off-by: "Ricardo B. 
Marliere" Cc: Tejun Heo Link: https://lore.kernel.org/r/20240206-bus_cleanup-workqueue-v1-1-72b10d282d58@marliere.net Signed-off-by: Greg Kroah-Hartman --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 76e60faed892..f1e062acd091 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -6157,7 +6157,7 @@ static struct device_attribute wq_sysfs_unbound_attrs[] = { __ATTR_NULL, }; -static struct bus_type wq_subsys = { +static const struct bus_type wq_subsys = { .name = "workqueue", .dev_groups = wq_sysfs_groups, }; From 2444a80c1cc2c4240f60f2162abef3797c1803de Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 14 Feb 2024 08:48:28 +0000 Subject: [PATCH 07/28] kobject: make uevent_seqnum atomic We will soon no longer acquire uevent_sock_mutex for most kobject_uevent_net_broadcast() calls, and also while calling uevent_net_broadcast(). Make uevent_seqnum an atomic64_t to get its own protection. This fixes a race while reading /sys/kernel/uevent_seqnum. 
Signed-off-by: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Christian Brauner Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20240214084829.684541-2-edumazet@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/kobject.h | 2 +- kernel/ksysfs.c | 2 +- lib/kobject_uevent.c | 17 +++++++++-------- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/include/linux/kobject.h b/include/linux/kobject.h index c30affcc43b4..c8219505a79f 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -38,7 +38,7 @@ extern char uevent_helper[]; #endif /* counter to tag the uevent, read only except for the kobject core */ -extern u64 uevent_seqnum; +extern atomic64_t uevent_seqnum; /* * The actions here must match the index to the string array diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 1d4bc493b2f4..32ae7fa74a9c 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -39,7 +39,7 @@ static struct kobj_attribute _name##_attr = __ATTR_RW(_name) static ssize_t uevent_seqnum_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sysfs_emit(buf, "%llu\n", (unsigned long long)uevent_seqnum); + return sysfs_emit(buf, "%llu\n", (u64)atomic64_read(&uevent_seqnum)); } KERNEL_ATTR_RO(uevent_seqnum); diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index fb9a2f06dd1e..9cb1a7fdaeba 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -30,7 +30,7 @@ #include -u64 uevent_seqnum; +atomic64_t uevent_seqnum; #ifdef CONFIG_UEVENT_HELPER char uevent_helper[UEVENT_HELPER_PATH_LEN] = CONFIG_UEVENT_HELPER_PATH; #endif @@ -44,7 +44,7 @@ struct uevent_sock { static LIST_HEAD(uevent_sock_list); #endif -/* This lock protects uevent_seqnum and uevent_sock_list */ +/* This lock protects uevent_sock_list */ static DEFINE_MUTEX(uevent_sock_mutex); /* the strings here must match the enum in include/linux/kobject.h */ @@ -583,13 +583,13 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, break; } - 
mutex_lock(&uevent_sock_mutex); /* we will send an event, so request a new sequence number */ - retval = add_uevent_var(env, "SEQNUM=%llu", ++uevent_seqnum); - if (retval) { - mutex_unlock(&uevent_sock_mutex); + retval = add_uevent_var(env, "SEQNUM=%llu", + atomic64_inc_return(&uevent_seqnum)); + if (retval) goto exit; - } + + mutex_lock(&uevent_sock_mutex); retval = kobject_uevent_net_broadcast(kobj, env, action_string, devpath); mutex_unlock(&uevent_sock_mutex); @@ -688,7 +688,8 @@ static int uevent_net_broadcast(struct sock *usk, struct sk_buff *skb, int ret; /* bump and prepare sequence number */ - ret = snprintf(buf, sizeof(buf), "SEQNUM=%llu", ++uevent_seqnum); + ret = snprintf(buf, sizeof(buf), "SEQNUM=%llu", + atomic64_inc_return(&uevent_seqnum)); if (ret < 0 || (size_t)ret >= sizeof(buf)) return -ENOMEM; ret++; From 5c0941c55e5f681ffb05f395222ac673460bb3d0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 14 Feb 2024 08:48:29 +0000 Subject: [PATCH 08/28] kobject: reduce uevent_sock_mutex scope This is a followup of commit a3498436b3a0 ("netns: restrict uevents") - uevent_sock_mutex no longer protects uevent_seqnum thanks to prior patch in the series. - uevent_net_broadcast() can run without holding uevent_sock_mutex. - Instead of grabbing uevent_sock_mutex before calling kobject_uevent_net_broadcast(), we can move the mutex_lock(&uevent_sock_mutex) to the place we iterate over uevent_sock_list : uevent_net_broadcast_untagged(). After this patch, typical netdevice creations and destructions calling uevent_net_broadcast_tagged() no longer need to acquire uevent_sock_mutex. 
Signed-off-by: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Christian Brauner Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20240214084829.684541-3-edumazet@google.com Signed-off-by: Greg Kroah-Hartman --- lib/kobject_uevent.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 9cb1a7fdaeba..03b427e2707e 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -42,10 +42,9 @@ struct uevent_sock { #ifdef CONFIG_NET static LIST_HEAD(uevent_sock_list); -#endif - /* This lock protects uevent_sock_list */ static DEFINE_MUTEX(uevent_sock_mutex); +#endif /* the strings here must match the enum in include/linux/kobject.h */ static const char *kobject_actions[] = { @@ -315,6 +314,7 @@ static int uevent_net_broadcast_untagged(struct kobj_uevent_env *env, int retval = 0; /* send netlink message */ + mutex_lock(&uevent_sock_mutex); list_for_each_entry(ue_sk, &uevent_sock_list, list) { struct sock *uevent_sock = ue_sk->sk; @@ -334,6 +334,7 @@ static int uevent_net_broadcast_untagged(struct kobj_uevent_env *env, if (retval == -ENOBUFS || retval == -ESRCH) retval = 0; } + mutex_unlock(&uevent_sock_mutex); consume_skb(skb); return retval; @@ -589,10 +590,8 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, if (retval) goto exit; - mutex_lock(&uevent_sock_mutex); retval = kobject_uevent_net_broadcast(kobj, env, action_string, devpath); - mutex_unlock(&uevent_sock_mutex); #ifdef CONFIG_UEVENT_HELPER /* call uevent_helper, usually only enabled during early boot */ @@ -743,9 +742,7 @@ static int uevent_net_rcv_skb(struct sk_buff *skb, struct nlmsghdr *nlh, return -EPERM; } - mutex_lock(&uevent_sock_mutex); ret = uevent_net_broadcast(net->uevent_sock->sk, skb, extack); - mutex_unlock(&uevent_sock_mutex); return ret; } From d87c295f599cab2ab3b3df53a9098adba4a6002b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 30 Jan 2024 10:46:27 -0800 Subject: [PATCH 09/28] 
sysfs: Introduce a mechanism to hide static attribute_groups Add a mechanism for named attribute_groups to hide their directory at sysfs_update_group() time, or otherwise skip emitting the group directory when the group is first registered. It piggybacks on is_visible() in a similar manner to SYSFS_PREALLOC, i.e. special flags in the upper bits of the returned mode. To use it, specify a symbol prefix to DEFINE_SYSFS_GROUP_VISIBLE(), and then pass that same prefix to SYSFS_GROUP_VISIBLE() when assigning the @is_visible() callback: DEFINE_SYSFS_GROUP_VISIBLE($prefix) struct attribute_group $prefix_group = { .name = $name, .is_visible = SYSFS_GROUP_VISIBLE($prefix), }; SYSFS_GROUP_VISIBLE() expects a definition of $prefix_group_visible() and $prefix_attr_visible(), where $prefix_group_visible() just returns true / false and $prefix_attr_visible() behaves as normal. The motivation for this capability is to centralize PCI device authentication in the PCI core with a named sysfs group while keeping that group hidden for devices and platforms that do not meet the requirements. In a PCI topology, most devices will not support authentication, a small subset will support just PCI CMA (Component Measurement and Authentication), a smaller subset will support PCI CMA + PCIe IDE (Link Integrity and Encryption), and only next generation server hosts will start to include a platform TSM (TEE Security Manager). Without this capability the alternatives are: * Check if all attributes are invisible and if so, hide the directory. Beyond trouble getting this to work [1], this is an ABI change for scenarios where userspace happens to depend on group visibility absent any attributes. I.e. this new capability avoids regression since it does not retroactively apply to existing cases. * Publish an empty /sys/bus/pci/devices/$pdev/tsm/ directory for all PCI devices (i.e. for the case when TSM platform support is present, but device support is absent).
Unfortunate that this will be a vestigial empty directory in the vast majority of cases. * Reintroduce usage of runtime calls to sysfs_{create,remove}_group() in the PCI core. Bjorn has already indicated that he does not want to see any growth of pci_sysfs_init() [2]. * Drop the named group and simulate a directory by prefixing all TSM-related attributes with "tsm_". Unfortunate to not use the naming capability of a sysfs group as intended. In comparison, there is a small potential for regression if for some reason an @is_visible() callback had dependencies on how many times it was called. Additionally, it is no longer an error to update a group that does not have its directory already present, and it is no longer a WARN() to remove a group that was never visible. Link: https://lore.kernel.org/all/2024012321-envious-procedure-4a58@gregkh/ [1] Link: https://lore.kernel.org/linux-pci/20231019200110.GA1410324@bhelgaas/ [2] Signed-off-by: Dan Williams Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/2024013028-deflator-flaring-ec62@gregkh Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/group.c | 45 ++++++++++++++++++++++++------- include/linux/sysfs.h | 63 ++++++++++++++++++++++++++++++++++--------- 2 files changed, 87 insertions(+), 21 deletions(-) diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 138676463336..ccb275cdabcb 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -31,6 +31,17 @@ static void remove_files(struct kernfs_node *parent, kernfs_remove_by_name(parent, (*bin_attr)->attr.name); } +static umode_t __first_visible(const struct attribute_group *grp, struct kobject *kobj) +{ + if (grp->attrs && grp->is_visible) + return grp->is_visible(kobj, grp->attrs[0], 0); + + if (grp->bin_attrs && grp->is_bin_visible) + return grp->is_bin_visible(kobj, grp->bin_attrs[0], 0); + + return 0; +} + static int create_files(struct kernfs_node *parent, struct kobject *kobj, kuid_t uid, kgid_t gid, const struct attribute_group *grp, int update) @@ 
-52,6 +63,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj, kernfs_remove_by_name(parent, (*attr)->name); if (grp->is_visible) { mode = grp->is_visible(kobj, *attr, i); + mode &= ~SYSFS_GROUP_INVISIBLE; if (!mode) continue; } @@ -81,6 +93,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj, (*bin_attr)->attr.name); if (grp->is_bin_visible) { mode = grp->is_bin_visible(kobj, *bin_attr, i); + mode &= ~SYSFS_GROUP_INVISIBLE; if (!mode) continue; } @@ -127,16 +140,31 @@ static int internal_create_group(struct kobject *kobj, int update, kobject_get_ownership(kobj, &uid, &gid); if (grp->name) { + umode_t mode = __first_visible(grp, kobj); + + if (mode & SYSFS_GROUP_INVISIBLE) + mode = 0; + else + mode = S_IRWXU | S_IRUGO | S_IXUGO; + if (update) { kn = kernfs_find_and_get(kobj->sd, grp->name); if (!kn) { - pr_warn("Can't update unknown attr grp name: %s/%s\n", - kobj->name, grp->name); - return -EINVAL; + pr_debug("attr grp %s/%s not created yet\n", + kobj->name, grp->name); + /* may have been invisible prior to this update */ + update = 0; + } else if (!mode) { + sysfs_remove_group(kobj, grp); + kernfs_put(kn); + return 0; } - } else { - kn = kernfs_create_dir_ns(kobj->sd, grp->name, - S_IRWXU | S_IRUGO | S_IXUGO, + } + + if (!update) { + if (!mode) + return 0; + kn = kernfs_create_dir_ns(kobj->sd, grp->name, mode, uid, gid, kobj, NULL); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) @@ -279,9 +307,8 @@ void sysfs_remove_group(struct kobject *kobj, if (grp->name) { kn = kernfs_find_and_get(parent, grp->name); if (!kn) { - WARN(!kn, KERN_WARNING - "sysfs group '%s' not found for kobject '%s'\n", - grp->name, kobject_name(kobj)); + pr_debug("sysfs group '%s' not found for kobject '%s'\n", + grp->name, kobject_name(kobj)); return; } } else { diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index b717a70219f6..a42642b277dd 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -61,22 +61,32 @@ do { \ 
/** * struct attribute_group - data structure used to declare an attribute group. * @name: Optional: Attribute group name - * If specified, the attribute group will be created in - * a new subdirectory with this name. + * If specified, the attribute group will be created in a + * new subdirectory with this name. Additionally when a + * group is named, @is_visible and @is_bin_visible may + * return SYSFS_GROUP_INVISIBLE to control visibility of + * the directory itself. * @is_visible: Optional: Function to return permissions associated with an - * attribute of the group. Will be called repeatedly for each - * non-binary attribute in the group. Only read/write + * attribute of the group. Will be called repeatedly for + * each non-binary attribute in the group. Only read/write * permissions as well as SYSFS_PREALLOC are accepted. Must - * return 0 if an attribute is not visible. The returned value - * will replace static permissions defined in struct attribute. + * return 0 if an attribute is not visible. The returned + * value will replace static permissions defined in struct + * attribute. Use SYSFS_GROUP_VISIBLE() when assigning this + * callback to specify separate _group_visible() and + * _attr_visible() handlers. * @is_bin_visible: * Optional: Function to return permissions associated with a * binary attribute of the group. Will be called repeatedly * for each binary attribute in the group. Only read/write - * permissions as well as SYSFS_PREALLOC are accepted. Must - * return 0 if a binary attribute is not visible. The returned - * value will replace static permissions defined in - * struct bin_attribute. + * permissions as well as SYSFS_PREALLOC (and the + * visibility flags for named groups) are accepted. Must + * return 0 if a binary attribute is not visible. The + * returned value will replace static permissions defined + * in struct bin_attribute. 
If @is_visible is not set, Use + * SYSFS_GROUP_VISIBLE() when assigning this callback to + * specify separate _group_visible() and _attr_visible() + * handlers. * @attrs: Pointer to NULL terminated list of attributes. * @bin_attrs: Pointer to NULL terminated list of binary attributes. * Either attrs or bin_attrs or both must be provided. @@ -91,13 +101,42 @@ struct attribute_group { struct bin_attribute **bin_attrs; }; +#define SYSFS_PREALLOC 010000 +#define SYSFS_GROUP_INVISIBLE 020000 + +/* + * The first call to is_visible() in the create / update path may + * indicate visibility for the entire group + */ +#define DEFINE_SYSFS_GROUP_VISIBLE(name) \ + static inline umode_t sysfs_group_visible_##name( \ + struct kobject *kobj, struct attribute *attr, int n) \ + { \ + if (n == 0 && !name##_group_visible(kobj)) \ + return SYSFS_GROUP_INVISIBLE; \ + return name##_attr_visible(kobj, attr, n); \ + } + +/* + * Same as DEFINE_SYSFS_GROUP_VISIBLE, but for groups with only binary + * attributes + */ +#define DEFINE_SYSFS_BIN_GROUP_VISIBLE(name) \ + static inline umode_t sysfs_group_visible_##name( \ + struct kobject *kobj, struct bin_attribute *attr, int n) \ + { \ + if (n == 0 && !name##_group_visible(kobj)) \ + return SYSFS_GROUP_INVISIBLE; \ + return name##_attr_visible(kobj, attr, n); \ + } + +#define SYSFS_GROUP_VISIBLE(fn) sysfs_group_visible_##fn + /* * Use these macros to make defining attributes easier. * See include/linux/device.h for examples.. 
*/ -#define SYSFS_PREALLOC 010000 - #define __ATTR(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ From 70317fd24b419091aa0a6dc3ea3ec7bb50c37c32 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 30 Jan 2024 10:46:27 -0800 Subject: [PATCH 10/28] sysfs: Introduce a mechanism to hide static attribute_groups Add a mechanism for named attribute_groups to hide their directory at sysfs_update_group() time, or otherwise skip emitting the group directory when the group is first registered. It piggybacks on is_visible() in a similar manner as SYSFS_PREALLOC, i.e. special flags in the upper bits of the returned mode. To use it, specify a symbol prefix to DEFINE_SYSFS_GROUP_VISIBLE(), and then pass that same prefix to SYSFS_GROUP_VISIBLE() when assigning the @is_visible() callback: DEFINE_SYSFS_GROUP_VISIBLE($prefix) struct attribute_group $prefix_group = { .name = $name, .is_visible = SYSFS_GROUP_VISIBLE($prefix), }; SYSFS_GROUP_VISIBLE() expects a definition of $prefix_group_visible() and $prefix_attr_visible(), where $prefix_group_visible() just returns true / false and $prefix_attr_visible() behaves as normal. The motivation for this capability is to centralize PCI device authentication in the PCI core with a named sysfs group while keeping that group hidden for devices and platforms that do not meet the requirements. In a PCI topology, most devices will not support authentication, a small subset will support just PCI CMA (Component Measurement and Authentication), a smaller subset will support PCI CMA + PCIe IDE (Link Integrity and Encryption), and only next generation server hosts will start to include a platform TSM (TEE Security Manager). Without this capability the alternatives are: * Check if all attributes are invisible and if so, hide the directory. 
Beyond trouble getting this to work [1], this is an ABI change for scenarios where userspace happens to depend on group visibility absent any attributes. I.e. this new capability avoids regression since it does not retroactively apply to existing cases. * Publish an empty /sys/bus/pci/devices/$pdev/tsm/ directory for all PCI devices (i.e. for the case when TSM platform support is present, but device support is absent). It is unfortunate that this will be a vestigial empty directory in the vast majority of cases. * Reintroduce usage of runtime calls to sysfs_{create,remove}_group() in the PCI core. Bjorn has already indicated that he does not want to see any growth of pci_sysfs_init() [2]. * Drop the named group and simulate a directory by prefixing all TSM-related attributes with "tsm_". It is unfortunate not to use the naming capability of a sysfs group as intended. In comparison, there is a small potential for regression if for some reason an @is_visible() callback had dependencies on how many times it was called. Additionally, it is no longer an error to update a group that does not have its directory already present, and it is no longer a WARN() to remove a group that was never visible.
Link: https://lore.kernel.org/all/2024012321-envious-procedure-4a58@gregkh/ [1] Link: https://lore.kernel.org/linux-pci/20231019200110.GA1410324@bhelgaas/ [2] Signed-off-by: Dan Williams Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/2024013028-deflator-flaring-ec62@gregkh Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/group.c | 45 ++++++++++++++++++++++++------- include/linux/sysfs.h | 63 ++++++++++++++++++++++++++++++++++--------- 2 files changed, 87 insertions(+), 21 deletions(-) diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 138676463336..ccb275cdabcb 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -31,6 +31,17 @@ static void remove_files(struct kernfs_node *parent, kernfs_remove_by_name(parent, (*bin_attr)->attr.name); } +static umode_t __first_visible(const struct attribute_group *grp, struct kobject *kobj) +{ + if (grp->attrs && grp->is_visible) + return grp->is_visible(kobj, grp->attrs[0], 0); + + if (grp->bin_attrs && grp->is_bin_visible) + return grp->is_bin_visible(kobj, grp->bin_attrs[0], 0); + + return 0; +} + static int create_files(struct kernfs_node *parent, struct kobject *kobj, kuid_t uid, kgid_t gid, const struct attribute_group *grp, int update) @@ -52,6 +63,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj, kernfs_remove_by_name(parent, (*attr)->name); if (grp->is_visible) { mode = grp->is_visible(kobj, *attr, i); + mode &= ~SYSFS_GROUP_INVISIBLE; if (!mode) continue; } @@ -81,6 +93,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj, (*bin_attr)->attr.name); if (grp->is_bin_visible) { mode = grp->is_bin_visible(kobj, *bin_attr, i); + mode &= ~SYSFS_GROUP_INVISIBLE; if (!mode) continue; } @@ -127,16 +140,31 @@ static int internal_create_group(struct kobject *kobj, int update, kobject_get_ownership(kobj, &uid, &gid); if (grp->name) { + umode_t mode = __first_visible(grp, kobj); + + if (mode & SYSFS_GROUP_INVISIBLE) + mode = 0; + else + mode = S_IRWXU | 
S_IRUGO | S_IXUGO; + if (update) { kn = kernfs_find_and_get(kobj->sd, grp->name); if (!kn) { - pr_warn("Can't update unknown attr grp name: %s/%s\n", - kobj->name, grp->name); - return -EINVAL; + pr_debug("attr grp %s/%s not created yet\n", + kobj->name, grp->name); + /* may have been invisible prior to this update */ + update = 0; + } else if (!mode) { + sysfs_remove_group(kobj, grp); + kernfs_put(kn); + return 0; } - } else { - kn = kernfs_create_dir_ns(kobj->sd, grp->name, - S_IRWXU | S_IRUGO | S_IXUGO, + } + + if (!update) { + if (!mode) + return 0; + kn = kernfs_create_dir_ns(kobj->sd, grp->name, mode, uid, gid, kobj, NULL); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) @@ -279,9 +307,8 @@ void sysfs_remove_group(struct kobject *kobj, if (grp->name) { kn = kernfs_find_and_get(parent, grp->name); if (!kn) { - WARN(!kn, KERN_WARNING - "sysfs group '%s' not found for kobject '%s'\n", - grp->name, kobject_name(kobj)); + pr_debug("sysfs group '%s' not found for kobject '%s'\n", + grp->name, kobject_name(kobj)); return; } } else { diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index b717a70219f6..a42642b277dd 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -61,22 +61,32 @@ do { \ /** * struct attribute_group - data structure used to declare an attribute group. * @name: Optional: Attribute group name - * If specified, the attribute group will be created in - * a new subdirectory with this name. + * If specified, the attribute group will be created in a + * new subdirectory with this name. Additionally when a + * group is named, @is_visible and @is_bin_visible may + * return SYSFS_GROUP_INVISIBLE to control visibility of + * the directory itself. * @is_visible: Optional: Function to return permissions associated with an - * attribute of the group. Will be called repeatedly for each - * non-binary attribute in the group. Only read/write + * attribute of the group. Will be called repeatedly for + * each non-binary attribute in the group. 
Only read/write * permissions as well as SYSFS_PREALLOC are accepted. Must - * return 0 if an attribute is not visible. The returned value - * will replace static permissions defined in struct attribute. + * return 0 if an attribute is not visible. The returned + * value will replace static permissions defined in struct + * attribute. Use SYSFS_GROUP_VISIBLE() when assigning this + * callback to specify separate _group_visible() and + * _attr_visible() handlers. * @is_bin_visible: * Optional: Function to return permissions associated with a * binary attribute of the group. Will be called repeatedly * for each binary attribute in the group. Only read/write - * permissions as well as SYSFS_PREALLOC are accepted. Must - * return 0 if a binary attribute is not visible. The returned - * value will replace static permissions defined in - * struct bin_attribute. + * permissions as well as SYSFS_PREALLOC (and the + * visibility flags for named groups) are accepted. Must + * return 0 if a binary attribute is not visible. The + * returned value will replace static permissions defined + * in struct bin_attribute. If @is_visible is not set, Use + * SYSFS_GROUP_VISIBLE() when assigning this callback to + * specify separate _group_visible() and _attr_visible() + * handlers. * @attrs: Pointer to NULL terminated list of attributes. * @bin_attrs: Pointer to NULL terminated list of binary attributes. * Either attrs or bin_attrs or both must be provided. 
@@ -91,13 +101,42 @@ struct attribute_group { struct bin_attribute **bin_attrs; }; +#define SYSFS_PREALLOC 010000 +#define SYSFS_GROUP_INVISIBLE 020000 + +/* + * The first call to is_visible() in the create / update path may + * indicate visibility for the entire group + */ +#define DEFINE_SYSFS_GROUP_VISIBLE(name) \ + static inline umode_t sysfs_group_visible_##name( \ + struct kobject *kobj, struct attribute *attr, int n) \ + { \ + if (n == 0 && !name##_group_visible(kobj)) \ + return SYSFS_GROUP_INVISIBLE; \ + return name##_attr_visible(kobj, attr, n); \ + } + +/* + * Same as DEFINE_SYSFS_GROUP_VISIBLE, but for groups with only binary + * attributes + */ +#define DEFINE_SYSFS_BIN_GROUP_VISIBLE(name) \ + static inline umode_t sysfs_group_visible_##name( \ + struct kobject *kobj, struct bin_attribute *attr, int n) \ + { \ + if (n == 0 && !name##_group_visible(kobj)) \ + return SYSFS_GROUP_INVISIBLE; \ + return name##_attr_visible(kobj, attr, n); \ + } + +#define SYSFS_GROUP_VISIBLE(fn) sysfs_group_visible_##fn + /* * Use these macros to make defining attributes easier. * See include/linux/device.h for examples.. */ -#define SYSFS_PREALLOC 010000 - #define __ATTR(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ From cd69fedf58f8ab1ab511f7c6ac1969cebf1c935f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 22 Feb 2024 12:40:54 -0800 Subject: [PATCH 11/28] sysfs: Fix crash on empty group attributes array It turns out that arch/x86/events/intel/core.c makes use of "empty" attributes. static struct attribute *empty_attrs; __init int intel_pmu_init(void) { struct attribute **extra_skl_attr = &empty_attrs; struct attribute **extra_attr = &empty_attrs; struct attribute **td_attr = &empty_attrs; struct attribute **mem_attr = &empty_attrs; struct attribute **tsx_attr = &empty_attrs; ... 
That breaks the assumption in __first_visible(), which expects that if grp->attrs is set then grp->attrs[0] must also be set, and results in backtraces like: BUG: kernel NULL pointer dereference, address: 00rnel mode #PF: error_code(0x0000) - not-present ] PREEMPT SMP NOPTI CPU: 1 PID: 1 Comm: swapper/IP: 0010:exra_is_visible+0x14/0x20 ? exc_page_fault+0x68/0x190 internal_create_groups+0x42/0xa0 pmu_dev_alloc+0xc0/0xe0 perf_event_sysfs_init+0x580000000000 ]--- RIP: 0010:exra_is_visible+0x14/0 Check for non-empty attributes array before calling is_visible(). Reported-by: Pierre-Louis Bossart Closes: https://github.com/thesofproject/linux/pull/4799#issuecomment-1958537212 Fixes: 70317fd24b41 ("sysfs: Introduce a mechanism to hide static attribute_groups") Cc: Marc Herbert Cc: Rafael J. Wysocki Signed-off-by: Dan Williams Tested-by: Marc Herbert Link: https://lore.kernel.org/r/170863445442.1479840.1818801787239831650.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/group.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index ccb275cdabcb..8c63ba3cfc47 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -33,10 +33,10 @@ static void remove_files(struct kernfs_node *parent, static umode_t __first_visible(const struct attribute_group *grp, struct kobject *kobj) { - if (grp->attrs && grp->is_visible) + if (grp->attrs && grp->attrs[0] && grp->is_visible) return grp->is_visible(kobj, grp->attrs[0], 0); - if (grp->bin_attrs && grp->is_bin_visible) + if (grp->bin_attrs && grp->bin_attrs[0] && grp->is_bin_visible) return grp->is_bin_visible(kobj, grp->bin_attrs[0], 0); return 0; From aa3c88990f77bb9acb3d445337bc088031ac63f9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 22 Feb 2024 12:41:00 -0800 Subject: [PATCH 12/28] sysfs: Document new "group visible" helpers Add documentation and examples for how to use DEFINE_SYSFS_GROUP_VISIBLE() and SYSFS_GROUP_VISIBLE().
Recall that the motivation for this work is that it is easier to reason about the lifetime of statically defined sysfs attributes that become visible at device_add() time rather than dynamically adding them later. DEFINE_SYSFS_GROUP_VISIBLE() tackles one of the reasons to opt for dynamically created attributes which did not have a facility for hiding empty directories. Signed-off-by: Dan Williams Link: https://lore.kernel.org/r/170863446065.1479840.10697164014098377292.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index a42642b277dd..dabf7f4f3581 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -105,8 +105,42 @@ struct attribute_group { #define SYSFS_GROUP_INVISIBLE 020000 /* - * The first call to is_visible() in the create / update path may - * indicate visibility for the entire group + * DEFINE_SYSFS_GROUP_VISIBLE(name): + * A helper macro to pair with the assignment of ".is_visible = + * SYSFS_GROUP_VISIBLE(name)", that arranges for the directory + * associated with a named attribute_group to optionally be hidden. + * This allows for static declaration of attribute_groups, and the + * simplification of attribute visibility lifetime that implies, + * without polluting sysfs with empty attribute directories. + * Ex. 
+ * + * static umode_t example_attr_visible(struct kobject *kobj, + * struct attribute *attr, int n) + * { + * if (example_attr_condition) + * return 0; + * else if (ro_attr_condition) + * return 0444; + * return attr->mode; + * } + * + * static bool example_group_visible(struct kobject *kobj) + * { + * if (example_group_condition) + * return false; + * return true; + * } + * + * DEFINE_SYSFS_GROUP_VISIBLE(example); + * + * static struct attribute_group example_group = { + * .name = "example", + * .is_visible = SYSFS_GROUP_VISIBLE(example), + * .attrs = &example_attrs, + * }; + * + * Note that it expects _attr_visible and _group_visible to + * be defined. */ #define DEFINE_SYSFS_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ @@ -119,7 +153,9 @@ struct attribute_group { /* * Same as DEFINE_SYSFS_GROUP_VISIBLE, but for groups with only binary - * attributes + * attributes. If an attribute_group defines both text and binary + * attributes, the group visibility is determined by the function + * specified to is_visible() not is_bin_visible() */ #define DEFINE_SYSFS_BIN_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ From 04edfa7fa059ba50d3236b55ba0ae23b1721e868 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 22 Feb 2024 12:41:06 -0800 Subject: [PATCH 13/28] sysfs: Introduce DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE() One of the first users of DEFINE_SYSFS_GROUP_VISIBLE() did this: static umode_t dp0_attr_visible(struct kobject *kobj, struct attribute *attr, int n) { struct sdw_slave *slave = dev_to_sdw_dev(kobj_to_dev(kobj)); if (slave->prop.dp0_prop) return attr->mode; return 0; } static bool dp0_group_visible(struct kobject *kobj) { struct sdw_slave *slave = dev_to_sdw_dev(kobj_to_dev(kobj)); if (slave->prop.dp0_prop) return true; return false; } DEFINE_SYSFS_GROUP_VISIBLE(dp0); ...i.e. the _group_visible() helper is identical to the _attr_visible() helper.
Use the "simple" helper to reduce that to: static bool dp0_group_visible(struct kobject *kobj) { struct sdw_slave *slave = dev_to_sdw_dev(kobj_to_dev(kobj)); if (slave->prop.dp0_prop) return true; return false; } DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(dp0); Remove the need to specify per attribute visibility if the goal is to hide the entire group. Signed-off-by: Dan Williams Link: https://lore.kernel.org/r/170863446625.1479840.10593839479268727913.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 45 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index dabf7f4f3581..326341c62385 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -140,7 +140,9 @@ struct attribute_group { * }; * * Note that it expects _attr_visible and _group_visible to - * be defined. + * be defined. For cases where individual attributes do not need + * separate visibility consideration, only entire group visibility at + * once, see DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(). */ #define DEFINE_SYSFS_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ @@ -151,6 +153,38 @@ struct attribute_group { return name##_attr_visible(kobj, attr, n); \ } +/* + * DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(name): + * A helper macro to pair with SYSFS_GROUP_VISIBLE() that like + * DEFINE_SYSFS_GROUP_VISIBLE() controls group visibility, but does + * not require the implementation of a per-attribute visibility + * callback. + * Ex. 
+ * + * static bool example_group_visible(struct kobject *kobj) + * { + * if (example_group_condition) + * return false; + * return true; + * } + * + * DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(example); + * + * static struct attribute_group example_group = { + * .name = "example", + * .is_visible = SYSFS_GROUP_VISIBLE(example), + * .attrs = &example_attrs, + * }; + */ +#define DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(name) \ + static inline umode_t sysfs_group_visible_##name( \ + struct kobject *kobj, struct attribute *a, int n) \ + { \ + if (n == 0 && !name##_group_visible(kobj)) \ + return SYSFS_GROUP_INVISIBLE; \ + return a->mode; \ + } + /* * Same as DEFINE_SYSFS_GROUP_VISIBLE, but for groups with only binary * attributes. If an attribute_group defines both text and binary @@ -166,6 +200,15 @@ struct attribute_group { return name##_attr_visible(kobj, attr, n); \ } +#define DEFINE_SIMPLE_SYSFS_BIN_GROUP_VISIBLE(name) \ + static inline umode_t sysfs_group_visible_##name( \ + struct kobject *kobj, struct bin_attribute *a, int n) \ + { \ + if (n == 0 && !name##_group_visible(kobj)) \ + return SYSFS_GROUP_INVISIBLE; \ + return a->mode; \ + } + #define SYSFS_GROUP_VISIBLE(fn) sysfs_group_visible_##fn /* From 822d66c45e793240a9888463127059558bbe9c0d Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 20 Jan 2024 07:58:46 +0100 Subject: [PATCH 14/28] platform-msi: Remove usage of the deprecated ida_simple_xx() API ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). Note that the upper limit of ida_simple_get() is exclusive, but the one of ida_alloc_max() is inclusive. So a -1 has been added when needed. 
Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/fd87836efa894aee0ae43e767369c85a2ee7e1ff.1705733916.git.christophe.jaillet@wanadoo.fr Signed-off-by: Greg Kroah-Hartman --- drivers/base/platform-msi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/base/platform-msi.c b/drivers/base/platform-msi.c index f37ad34c80ec..ca48d1f60865 100644 --- a/drivers/base/platform-msi.c +++ b/drivers/base/platform-msi.c @@ -172,8 +172,8 @@ static int platform_msi_alloc_priv_data(struct device *dev, unsigned int nvec, if (!datap) return -ENOMEM; - datap->devid = ida_simple_get(&platform_msi_devid_ida, - 0, 1 << DEV_ID_SHIFT, GFP_KERNEL); + datap->devid = ida_alloc_max(&platform_msi_devid_ida, + (1 << DEV_ID_SHIFT) - 1, GFP_KERNEL); if (datap->devid < 0) { err = datap->devid; kfree(datap); @@ -191,7 +191,7 @@ static void platform_msi_free_priv_data(struct device *dev) struct platform_msi_priv_data *data = dev->msi.data->platform_data; dev->msi.data->platform_data = NULL; - ida_simple_remove(&platform_msi_devid_ida, data->devid); + ida_free(&platform_msi_devid_ida, data->devid); kfree(data); } From 8dde8fa0cc3edce73c050b9882d06c1a575f6402 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 17 Jan 2024 00:33:07 -0800 Subject: [PATCH 15/28] firmware_loader: introduce __free() cleanup handler Define cleanup handler using facilities from linux/cleanup.h to simplify error handling in code using firmware loader. This will allow writing code like this: int driver_update_firmware(...) { const struct firmware *fw_entry __free(firmware) = NULL; int error; ...
error = request_firmware(&fw_entry, fw_name, dev); if (error) { dev_err(dev, "failed to request firmware %s: %d", fw_name, error); return error; } error = check_firmware_valid(fw_entry); if (error) return error; guard(mutex)(&instance->lock); error = use_firmware(instance, fw_entry); if (error) return error; return 0; } Signed-off-by: Dmitry Torokhov Acked-by: Luis Chamberlain Link: https://lore.kernel.org/r/ZaeQw7VXhnirX4pQ@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/firmware.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/firmware.h b/include/linux/firmware.h index 0311858b46ce..f026f8926d79 100644 --- a/include/linux/firmware.h +++ b/include/linux/firmware.h @@ -4,6 +4,7 @@ #include #include +#include #include #define FW_ACTION_NOUEVENT 0 @@ -198,4 +199,6 @@ static inline void firmware_upload_unregister(struct fw_upload *fw_upload) int firmware_request_cache(struct device *device, const char *name); +DEFINE_FREE(firmware, struct firmware *, release_firmware(_T)) + #endif From a54c1d1b859a57a99d5cbdce37ac754cbdd9344a Mon Sep 17 00:00:00 2001 From: Rohan Kollambalath Date: Mon, 12 Feb 2024 08:36:34 +1000 Subject: [PATCH 16/28] sysfs: Addresses documentation in sysfs_merge_group and sysfs_unmerge_group. These functions take a struct attribute_group as an input which has an optional .name field. These functions rely on the .name field being populated and do not check whether it is NULL. They pass this name into other functions, eventually leading to a NULL pointer dereference. This change simply updates the documentation of the function to make this requirement clear.
Signed-off-by: Rohan Kollambalath Link: https://lore.kernel.org/r/20240211223634.2103665-1-rohankollambalath@gmail.com Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/group.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 8c63ba3cfc47..d22ad67a0f32 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -345,13 +345,13 @@ void sysfs_remove_groups(struct kobject *kobj, EXPORT_SYMBOL_GPL(sysfs_remove_groups); /** - * sysfs_merge_group - merge files into a pre-existing attribute group. + * sysfs_merge_group - merge files into a pre-existing named attribute group. * @kobj: The kobject containing the group. * @grp: The files to create and the attribute group they belong to. * - * This function returns an error if the group doesn't exist or any of the - * files already exist in that group, in which case none of the new files - * are created. + * This function returns an error if the group doesn't exist, the .name field is + * NULL or any of the files already exist in that group, in which case none of + * the new files are created. */ int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp) @@ -383,7 +383,7 @@ int sysfs_merge_group(struct kobject *kobj, EXPORT_SYMBOL_GPL(sysfs_merge_group); /** - * sysfs_unmerge_group - remove files from a pre-existing attribute group. + * sysfs_unmerge_group - remove files from a pre-existing named attribute group. * @kobj: The kobject containing the group. * @grp: The files to remove and the attribute group they belong to. */ From 1fe6e4f0b0c47e70735066e889f97c3c6e1e79b2 Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Mon, 19 Feb 2024 22:09:54 +0530 Subject: [PATCH 17/28] firmware_loader: Suppress warning on FW_OPT_NO_WARN flag Some of the warnings are still being printed even if FW_OPT_NO_WARN is passed for some of the function e.g., firmware_request_nowarn(). Fix it by adding a check for FW_OPT_NO_WARN before printing the warning. 
Signed-off-by: Mukesh Ojha Reviewed-by: Luis Chamberlain Link: https://lore.kernel.org/r/20240219163954.7719-1-quic_mojha@quicinc.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/firmware_loader/main.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c index ea28102d421e..da8ca01d011c 100644 --- a/drivers/base/firmware_loader/main.c +++ b/drivers/base/firmware_loader/main.c @@ -551,12 +551,16 @@ fw_get_filesystem_firmware(struct device *device, struct fw_priv *fw_priv, file_size_ptr, READING_FIRMWARE); if (rc < 0) { - if (rc != -ENOENT) - dev_warn(device, "loading %s failed with error %d\n", - path, rc); - else - dev_dbg(device, "loading %s failed for no such file or directory.\n", - path); + if (!(fw_priv->opt_flags & FW_OPT_NO_WARN)) { + if (rc != -ENOENT) + dev_warn(device, + "loading %s failed with error %d\n", + path, rc); + else + dev_dbg(device, + "loading %s failed for no such file or directory.\n", + path); + } continue; } size = rc; From bbf6cfba49a117c502ec5df66d3ab3b485c113f8 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 1 Mar 2024 20:00:05 +0200 Subject: [PATCH 18/28] driver core: Drop unneeded 'extern' keyword in fwnode.h We do not use 'extern' keyword with functions. Remove the last one mistakenly added to fwnode.h. Reviewed-by: Sakari Ailus Acked-by: Saravana Kannan Acked-by: "Rafael J. 
Wysocki" Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240301180138.271590-2-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/fwnode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 2a72f55d26eb..2d23a14857c7 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -209,9 +209,9 @@ static inline void fwnode_dev_initialized(struct fwnode_handle *fwnode, fwnode->flags &= ~FWNODE_FLAG_INITIALIZED; } -extern bool fw_devlink_is_strict(void); int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup); void fwnode_links_purge(struct fwnode_handle *fwnode); void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode); +bool fw_devlink_is_strict(void); #endif From 1c4002aeab3c81afa8a00ae76b1ea38d066e9978 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 1 Mar 2024 20:00:06 +0200 Subject: [PATCH 19/28] driver core: Move fw_devlink stuff to where it belongs A few APIs, i.e. fwnode_is_ancestor_of(), fwnode_get_next_parent_dev(), and get_dev_from_fwnode(), that belong specifically to the fw_devlink APIs, could be static, but they are not. Resolve this mess by moving them to drivers/base/core.c, where all of their users reside, and making them static. No functional changes intended. Reviewed-by: Sakari Ailus Acked-by: "Rafael J.
Wysocki" Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240301180138.271590-3-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 58 ++++++++++++++++++++++++++++++++++++++++ drivers/base/property.c | 56 -------------------------------------- include/linux/fwnode.h | 1 - include/linux/property.h | 2 -- 4 files changed, 58 insertions(+), 59 deletions(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index 9828da9b933c..35ccd8bb2c9b 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1871,6 +1871,7 @@ static void fw_devlink_unblock_consumers(struct device *dev) device_links_write_unlock(); } +#define get_dev_from_fwnode(fwnode) get_device((fwnode)->dev) static bool fwnode_init_without_drv(struct fwnode_handle *fwnode) { @@ -1901,6 +1902,63 @@ static bool fwnode_ancestor_init_without_drv(struct fwnode_handle *fwnode) return false; } +/** + * fwnode_is_ancestor_of - Test if @ancestor is ancestor of @child + * @ancestor: Firmware which is tested for being an ancestor + * @child: Firmware which is tested for being the child + * + * A node is considered an ancestor of itself too. + * + * Return: true if @ancestor is an ancestor of @child. Otherwise, returns false. + */ +static bool fwnode_is_ancestor_of(const struct fwnode_handle *ancestor, + const struct fwnode_handle *child) +{ + struct fwnode_handle *parent; + + if (IS_ERR_OR_NULL(ancestor)) + return false; + + if (child == ancestor) + return true; + + fwnode_for_each_parent_node(child, parent) { + if (parent == ancestor) { + fwnode_handle_put(parent); + return true; + } + } + return false; +} + +/** + * fwnode_get_next_parent_dev - Find device of closest ancestor fwnode + * @fwnode: firmware node + * + * Given a firmware node (@fwnode), this function finds its closest ancestor + * firmware node that has a corresponding struct device and returns that struct + * device. 
+ * + * The caller is responsible for calling put_device() on the returned device + * pointer. + * + * Return: a pointer to the device of the @fwnode's closest ancestor. + */ +static struct device *fwnode_get_next_parent_dev(const struct fwnode_handle *fwnode) +{ + struct fwnode_handle *parent; + struct device *dev; + + fwnode_for_each_parent_node(fwnode, parent) { + dev = get_dev_from_fwnode(parent); + if (dev) { + fwnode_handle_put(parent); + return dev; + } + } + return NULL; +} + /** * __fw_devlink_relax_cycles - Relax and mark dependency cycles. * @con: Potential consumer device. diff --git a/drivers/base/property.c b/drivers/base/property.c index a1b01ab42052..afa1bf2b3c5a 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -699,34 +699,6 @@ struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode) } EXPORT_SYMBOL_GPL(fwnode_get_next_parent); -/** - * fwnode_get_next_parent_dev - Find device of closest ancestor fwnode - * @fwnode: firmware node - * - * Given a firmware node (@fwnode), this function finds its closest ancestor - * firmware node that has a corresponding struct device and returns that struct - * device. - * - * The caller is responsible for calling put_device() on the returned device - * pointer. - * - * Return: a pointer to the device of the @fwnode's closest ancestor. 
- */ -struct device *fwnode_get_next_parent_dev(const struct fwnode_handle *fwnode) -{ - struct fwnode_handle *parent; - struct device *dev; - - fwnode_for_each_parent_node(fwnode, parent) { - dev = get_dev_from_fwnode(parent); - if (dev) { - fwnode_handle_put(parent); - return dev; - } - } - return NULL; -} - /** * fwnode_count_parents - Return the number of parents a node has * @fwnode: The node the parents of which are to be counted @@ -773,34 +745,6 @@ struct fwnode_handle *fwnode_get_nth_parent(struct fwnode_handle *fwnode, } EXPORT_SYMBOL_GPL(fwnode_get_nth_parent); -/** - * fwnode_is_ancestor_of - Test if @ancestor is ancestor of @child - * @ancestor: Firmware which is tested for being an ancestor - * @child: Firmware which is tested for being the child - * - * A node is considered an ancestor of itself too. - * - * Return: true if @ancestor is an ancestor of @child. Otherwise, returns false. - */ -bool fwnode_is_ancestor_of(const struct fwnode_handle *ancestor, const struct fwnode_handle *child) -{ - struct fwnode_handle *parent; - - if (IS_ERR_OR_NULL(ancestor)) - return false; - - if (child == ancestor) - return true; - - fwnode_for_each_parent_node(child, parent) { - if (parent == ancestor) { - fwnode_handle_put(parent); - return true; - } - } - return false; -} - /** * fwnode_get_next_child_node - Return the next child node handle for a node * @fwnode: Firmware node to find the next child node for. 
diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 2d23a14857c7..416cbe72f0c7 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -187,7 +187,6 @@ struct fwnode_operations { if (fwnode_has_op(fwnode, op)) \ (fwnode)->ops->op(fwnode, ## __VA_ARGS__); \ } while (false) -#define get_dev_from_fwnode(fwnode) get_device((fwnode)->dev) static inline void fwnode_init(struct fwnode_handle *fwnode, const struct fwnode_operations *ops) diff --git a/include/linux/property.h b/include/linux/property.h index e6516d0b7d52..284ff79ebf03 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -156,11 +156,9 @@ struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode); for (parent = fwnode_get_parent(fwnode); parent; \ parent = fwnode_get_next_parent(parent)) -struct device *fwnode_get_next_parent_dev(const struct fwnode_handle *fwnode); unsigned int fwnode_count_parents(const struct fwnode_handle *fwn); struct fwnode_handle *fwnode_get_nth_parent(struct fwnode_handle *fwn, unsigned int depth); -bool fwnode_is_ancestor_of(const struct fwnode_handle *ancestor, const struct fwnode_handle *child); struct fwnode_handle *fwnode_get_next_child_node( const struct fwnode_handle *fwnode, struct fwnode_handle *child); struct fwnode_handle *fwnode_get_next_available_child_node( From 420b104dd116cddd1615588a400b557bf4e436b4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 1 Mar 2024 20:00:07 +0200 Subject: [PATCH 20/28] device property: Move enum dev_dma_attr to fwnode.h The struct fwnode_operations defines one of its callbacks to return enum dev_dma_attr, but the enum is currently defined in property.h. Move it to the correct location. Reviewed-by: Sakari Ailus Acked-by: "Rafael J.
Wysocki" Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240301180138.271590-4-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/fwnode.h | 6 ++++++ include/linux/property.h | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 416cbe72f0c7..4228c45d5ccc 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -14,6 +14,12 @@ #include #include +enum dev_dma_attr { + DEV_DMA_NOT_SUPPORTED, + DEV_DMA_NON_COHERENT, + DEV_DMA_COHERENT, +}; + struct fwnode_operations; struct device; diff --git a/include/linux/property.h b/include/linux/property.h index 284ff79ebf03..1f0135e24d00 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -27,12 +27,6 @@ enum dev_prop_type { DEV_PROP_REF, }; -enum dev_dma_attr { - DEV_DMA_NOT_SUPPORTED, - DEV_DMA_NON_COHERENT, - DEV_DMA_COHERENT, -}; - const struct fwnode_handle *__dev_fwnode_const(const struct device *dev); struct fwnode_handle *__dev_fwnode(struct device *dev); #define dev_fwnode(dev) \ From 4dc3d612ee5c3be2a4d1a73ab31bcfaaa850aa19 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 1 Mar 2024 20:00:08 +0200 Subject: [PATCH 21/28] device property: Don't use "proxy" headers Update header inclusions to follow IWYU (Include What You Use) principle. Reviewed-by: Sakari Ailus Acked-by: "Rafael J. 
Wysocki" Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240301180138.271590-5-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/property.c | 11 ++++++----- drivers/base/swnode.c | 13 ++++++++++++- include/linux/fwnode.h | 4 ++-- include/linux/property.h | 1 + 4 files changed, 21 insertions(+), 8 deletions(-) diff --git a/drivers/base/property.c b/drivers/base/property.c index afa1bf2b3c5a..7324a704a9a1 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -7,15 +7,16 @@ * Mika Westerberg */ -#include +#include +#include #include -#include +#include #include -#include -#include -#include #include #include +#include +#include +#include struct fwnode_handle *__dev_fwnode(struct device *dev) { diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c index 36512fb75a20..eb6eb25b343b 100644 --- a/drivers/base/swnode.c +++ b/drivers/base/swnode.c @@ -6,10 +6,21 @@ * Author: Heikki Krogerus */ +#include #include -#include +#include +#include +#include +#include +#include +#include +#include #include #include +#include +#include +#include +#include #include "base.h" diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 4228c45d5ccc..80f3cd91b471 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -9,10 +9,10 @@ #ifndef _LINUX_FWNODE_H_ #define _LINUX_FWNODE_H_ -#include -#include #include #include +#include +#include enum dev_dma_attr { DEV_DMA_NOT_SUPPORTED, diff --git a/include/linux/property.h b/include/linux/property.h index 1f0135e24d00..3a1045eb786c 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -11,6 +11,7 @@ #define _LINUX_PROPERTY_H_ #include +#include #include #include #include From 952c3fce297f12c7ff59380adb66b564e2bc9b64 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 29 Feb 2024 15:36:20 +0100 Subject: [PATCH 22/28] debugfs: fix wait/cancellation handling during remove Ben Greear further reports deadlocks during concurrent debugfs 
remove while files are being accessed, even though the code in question now uses debugfs cancellations. Turns out that despite all the review on the locking, we missed completely that the logic is wrong: if the refcount hits zero we can finish (and need not wait for the completion), but if it doesn't we have to trigger all the cancellations. As written, we can _never_ get into the loop triggering the cancellations. Fix this, and explain it better while at it. Cc: stable@vger.kernel.org Fixes: 8c88a474357e ("debugfs: add API to allow debugfs operations cancellation") Reported-by: Ben Greear Closes: https://lore.kernel.org/r/1c9fa9e5-09f1-0522-fdbc-dbcef4d255ca@candelatech.com Tested-by: Madhan Sai Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20240229153635.6bfab7eb34d3.I6c7aeff8c9d6628a8bc1ddcf332205a49d801f17@changeid Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/inode.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 034a617cb1a5..a40da0065433 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -751,13 +751,28 @@ static void __debugfs_file_removed(struct dentry *dentry) if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT) return; - /* if we hit zero, just wait for all to finish */ - if (!refcount_dec_and_test(&fsd->active_users)) { - wait_for_completion(&fsd->active_users_drained); + /* if this was the last reference, we're done */ + if (refcount_dec_and_test(&fsd->active_users)) return; - } - /* if we didn't hit zero, try to cancel any we can */ + /* + * If there's still a reference, the code that obtained it can + * be in different states: + * - The common case of not using cancellations, or already + * after debugfs_leave_cancellation(), where we just need + * to wait for debugfs_file_put() which signals the completion; + * - inside a cancellation section, i.e. 
between + * debugfs_enter_cancellation() and debugfs_leave_cancellation(), + * in which case we need to trigger the ->cancel() function, + * and then wait for debugfs_file_put() just like in the + * previous case; + * - before debugfs_enter_cancellation() (but obviously after + * debugfs_file_get()), in which case we may not see the + * cancellation in the list on the first round of the loop, + * but debugfs_enter_cancellation() signals the completion + * after adding it, so this code gets woken up to call the + * ->cancel() function. + */ while (refcount_read(&fsd->active_users)) { struct debugfs_cancellation *c; From 75cde56a5b504d07a64ce0e3f8c7410df70308a3 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Mon, 4 Mar 2024 21:04:54 -0800 Subject: [PATCH 23/28] driver core: Adds flags param to fwnode_link_add() Allow the callers to set fwnode link flags when adding fwnode links. Signed-off-by: Saravana Kannan Acked-by: "Rafael J. Wysocki" Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20240305050458.1400667-2-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 5 +++-- drivers/firmware/efi/sysfb_efi.c | 2 +- drivers/of/property.c | 2 +- include/linux/fwnode.h | 3 ++- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index 35ccd8bb2c9b..83a6f429bddb 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -92,12 +92,13 @@ static int __fwnode_link_add(struct fwnode_handle *con, return 0; } -int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup) +int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup, + u8 flags) { int ret; mutex_lock(&fwnode_link_lock); - ret = __fwnode_link_add(con, sup, 0); + ret = __fwnode_link_add(con, sup, flags); mutex_unlock(&fwnode_link_lock); return ret; } diff --git a/drivers/firmware/efi/sysfb_efi.c b/drivers/firmware/efi/sysfb_efi.c index 456d0e5eaf78..cc807ed35aed 100644 --- 
a/drivers/firmware/efi/sysfb_efi.c +++ b/drivers/firmware/efi/sysfb_efi.c @@ -336,7 +336,7 @@ static int efifb_add_links(struct fwnode_handle *fwnode) if (!sup_np) return 0; - fwnode_link_add(fwnode, of_fwnode_handle(sup_np)); + fwnode_link_add(fwnode, of_fwnode_handle(sup_np), 0); of_node_put(sup_np); return 0; diff --git a/drivers/of/property.c b/drivers/of/property.c index b71267c6667c..bce849f21ae2 100644 --- a/drivers/of/property.c +++ b/drivers/of/property.c @@ -1085,7 +1085,7 @@ static void of_link_to_phandle(struct device_node *con_np, tmp_np = of_get_next_parent(tmp_np); } - fwnode_link_add(of_fwnode_handle(con_np), of_fwnode_handle(sup_np)); + fwnode_link_add(of_fwnode_handle(con_np), of_fwnode_handle(sup_np), 0); } /** diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 80f3cd91b471..70d9c40269b9 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -214,7 +214,8 @@ static inline void fwnode_dev_initialized(struct fwnode_handle *fwnode, fwnode->flags &= ~FWNODE_FLAG_INITIALIZED; } -int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup); +int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup, + u8 flags); void fwnode_links_purge(struct fwnode_handle *fwnode); void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode); bool fw_devlink_is_strict(void); From b7e1241d8f77ed64404a5e4450f43a319310fc91 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Mon, 4 Mar 2024 21:04:55 -0800 Subject: [PATCH 24/28] driver core: Add FWLINK_FLAG_IGNORE to completely ignore a fwnode link A fwnode link between specific supplier-consumer fwnodes can be added multiple times for multiple reasons. If that dependency doesn't exist, deleting the fwnode link once doesn't guarantee that it won't get created again. So, add FWLINK_FLAG_IGNORE flag to mark a fwnode link as one that needs to be completely ignored. 
Since a fwnode link's flags is an OR of all the flags passed to all the fwnode_link_add() calls to create that specific fwnode link, the FWLINK_FLAG_IGNORE flag is preserved and can be used to mark a fwnode link as one that needs to be completely ignored until it is deleted. Signed-off-by: Saravana Kannan Acked-by: "Rafael J. Wysocki" Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20240305050458.1400667-3-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 9 ++++++++- include/linux/fwnode.h | 2 ++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index 83a6f429bddb..b93f3c5716ae 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1012,7 +1012,8 @@ static struct fwnode_handle *fwnode_links_check_suppliers( return NULL; list_for_each_entry(link, &fwnode->suppliers, c_hook) - if (!(link->flags & FWLINK_FLAG_CYCLE)) + if (!(link->flags & + (FWLINK_FLAG_CYCLE | FWLINK_FLAG_IGNORE))) return link->supplier; return NULL; @@ -2021,6 +2022,9 @@ static bool __fw_devlink_relax_cycles(struct device *con, } list_for_each_entry(link, &sup_handle->suppliers, c_hook) { + if (link->flags & FWLINK_FLAG_IGNORE) + continue; + if (__fw_devlink_relax_cycles(con, link->supplier)) { __fwnode_link_cycle(link); ret = true; @@ -2099,6 +2103,9 @@ static int fw_devlink_create_devlink(struct device *con, int ret = 0; u32 flags; + if (link->flags & FWLINK_FLAG_IGNORE) + return 0; + if (con->fwnode == link->consumer) flags = fw_devlink_get_flags(link->flags); else diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 70d9c40269b9..0d79070c5a70 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -59,8 +59,10 @@ struct fwnode_handle { * fwnode link flags * * CYCLE: The fwnode link is part of a cycle. Don't defer probe. + * IGNORE: Completely ignore this link, even during cycle detection. 
*/ #define FWLINK_FLAG_CYCLE BIT(0) +#define FWLINK_FLAG_IGNORE BIT(1) struct fwnode_link { struct fwnode_handle *supplier; From 135116f3d01402b610e00dd54f3b059a3faf35de Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Mon, 4 Mar 2024 21:04:56 -0800 Subject: [PATCH 25/28] of: property: fw_devlink: Add support for "post-init-providers" property Add support for this property so that dependency cycles can be broken and fw_devlink can do better probe/suspend/resume ordering between devices in a dependency cycle. Signed-off-by: Saravana Kannan Acked-by: "Rafael J. Wysocki" Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20240305050458.1400667-4-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/of/property.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/of/property.c b/drivers/of/property.c index bce849f21ae2..b517a92dabca 100644 --- a/drivers/of/property.c +++ b/drivers/of/property.c @@ -1066,7 +1066,8 @@ of_fwnode_device_get_match_data(const struct fwnode_handle *fwnode, } static void of_link_to_phandle(struct device_node *con_np, - struct device_node *sup_np) + struct device_node *sup_np, + u8 flags) { struct device_node *tmp_np = of_node_get(sup_np); @@ -1085,7 +1086,7 @@ static void of_link_to_phandle(struct device_node *con_np, tmp_np = of_get_next_parent(tmp_np); } - fwnode_link_add(of_fwnode_handle(con_np), of_fwnode_handle(sup_np), 0); + fwnode_link_add(of_fwnode_handle(con_np), of_fwnode_handle(sup_np), flags); } /** @@ -1198,6 +1199,8 @@ static struct device_node *parse_##fname(struct device_node *np, \ * to a struct device, implement this ops so fw_devlink can use it * to find the true consumer. * @optional: Describes whether a supplier is mandatory or not + * @fwlink_flags: Optional fwnode link flags to use when creating a fwnode link + * for this property. 
* * Returns: * parse_prop() return values are @@ -1210,6 +1213,7 @@ struct supplier_bindings { const char *prop_name, int index); struct device_node *(*get_con_dev)(struct device_node *np); bool optional; + u8 fwlink_flags; }; DEFINE_SIMPLE_PROP(clocks, "clocks", "#clock-cells") @@ -1240,6 +1244,7 @@ DEFINE_SIMPLE_PROP(leds, "leds", NULL) DEFINE_SIMPLE_PROP(backlight, "backlight", NULL) DEFINE_SIMPLE_PROP(panel, "panel", NULL) DEFINE_SIMPLE_PROP(msi_parent, "msi-parent", "#msi-cells") +DEFINE_SIMPLE_PROP(post_init_providers, "post-init-providers", NULL) DEFINE_SUFFIX_PROP(regulators, "-supply", NULL) DEFINE_SUFFIX_PROP(gpio, "-gpio", "#gpio-cells") @@ -1349,6 +1354,10 @@ static const struct supplier_bindings of_supplier_bindings[] = { { .parse_prop = parse_regulators, }, { .parse_prop = parse_gpio, }, { .parse_prop = parse_gpios, }, + { + .parse_prop = parse_post_init_providers, + .fwlink_flags = FWLINK_FLAG_IGNORE, + }, {} }; @@ -1393,7 +1402,7 @@ static int of_link_property(struct device_node *con_np, const char *prop_name) : of_node_get(con_np); matched = true; i++; - of_link_to_phandle(con_dev_np, phandle); + of_link_to_phandle(con_dev_np, phandle, s->fwlink_flags); of_node_put(phandle); of_node_put(con_dev_np); } From 32de4b4f9dfa67917d2cc824a195498513ec8e8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=ADcolas=20F=2E=20R=2E=20A=2E=20Prado?= Date: Tue, 5 Mar 2024 17:21:36 -0500 Subject: [PATCH 26/28] driver: core: Log probe failure as error and with device metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drivers can return -ENODEV or -ENXIO from their probe to reject a device match, and return -EPROBE_DEFER if probe should be retried. Any other error code is not expected during normal behavior and indicates an issue occurred, so it should be logged at the error level. Also make use of the device variant, dev_err(), so that the device metadata is attached to the log message. Signed-off-by: "Nícolas F. R. A. 
Prado" Link: https://lore.kernel.org/r/20240305-device-probe-error-v1-1-a06d8722bf19@collabora.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/dd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 85152537dbf1..0b7cf4516796 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -592,8 +592,8 @@ static int call_driver_probe(struct device *dev, struct device_driver *drv) break; default: /* driver matched but the probe failed */ - pr_warn("%s: probe of %s failed with error %d\n", - drv->name, dev_name(dev), ret); + dev_err(dev, "probe with driver %s failed with error %d\n", + drv->name, ret); break; } From 448af2d28899a2b4b1b07944b4910dfd5841bf55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=ADcolas=20F=2E=20R=2E=20A=2E=20Prado?= Date: Tue, 5 Mar 2024 17:21:37 -0500 Subject: [PATCH 27/28] driver: core: Use dev_* instead of pr_* so device metadata is added MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the dev_* instead of the pr_* functions to log the status of device probe so that the log message gets the device metadata attached to it. Signed-off-by: "Nícolas F. R. A. 
Prado" Link: https://lore.kernel.org/r/20240305-device-probe-error-v1-2-a06d8722bf19@collabora.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/dd.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 0b7cf4516796..d6e7933e2521 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -397,13 +397,12 @@ bool device_is_bound(struct device *dev) static void driver_bound(struct device *dev) { if (device_is_bound(dev)) { - pr_warn("%s: device %s already bound\n", - __func__, kobject_name(&dev->kobj)); + dev_warn(dev, "%s: device already bound\n", __func__); return; } - pr_debug("driver: '%s': %s: bound to device '%s'\n", dev->driver->name, - __func__, dev_name(dev)); + dev_dbg(dev, "driver: '%s': %s: bound to device\n", dev->driver->name, + __func__); klist_add_tail(&dev->p->knode_driver, &dev->driver->p->klist_devices); device_links_driver_bound(dev); @@ -587,8 +586,8 @@ static int call_driver_probe(struct device *dev, struct device_driver *drv) break; case -ENODEV: case -ENXIO: - pr_debug("%s: probe of %s rejects match %d\n", - drv->name, dev_name(dev), ret); + dev_dbg(dev, "probe with driver %s rejects match %d\n", + drv->name, ret); break; default: /* driver matched but the probe failed */ @@ -620,8 +619,8 @@ static int really_probe(struct device *dev, struct device_driver *drv) if (link_ret == -EPROBE_DEFER) return link_ret; - pr_debug("bus: '%s': %s: probing driver %s with device %s\n", - drv->bus->name, __func__, drv->name, dev_name(dev)); + dev_dbg(dev, "bus: '%s': %s: probing driver %s with device\n", + drv->bus->name, __func__, drv->name); if (!list_empty(&dev->devres_head)) { dev_crit(dev, "Resources present before probing\n"); ret = -EBUSY; @@ -644,8 +643,7 @@ re_probe: ret = driver_sysfs_add(dev); if (ret) { - pr_err("%s: driver_sysfs_add(%s) failed\n", - __func__, dev_name(dev)); + dev_err(dev, "%s: driver_sysfs_add failed\n", __func__); goto sysfs_failed; } @@ 
-706,8 +704,8 @@ re_probe: dev->pm_domain->sync(dev); driver_bound(dev); - pr_debug("bus: '%s': %s: bound device %s to driver %s\n", - drv->bus->name, __func__, dev_name(dev), drv->name); + dev_dbg(dev, "bus: '%s': %s: bound device to driver %s\n", + drv->bus->name, __func__, drv->name); goto done; dev_sysfs_state_synced_failed: @@ -786,8 +784,8 @@ static int __driver_probe_device(struct device_driver *drv, struct device *dev) return -EBUSY; dev->can_match = true; - pr_debug("bus: '%s': %s: matched device %s with driver %s\n", - drv->bus->name, __func__, dev_name(dev), drv->name); + dev_dbg(dev, "bus: '%s': %s: matched device with driver %s\n", + drv->bus->name, __func__, drv->name); pm_runtime_get_suppliers(dev); if (dev->parent) From 6aeb8850e0f39869d43768603a75c0431562a429 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=ADcolas=20F=2E=20R=2E=20A=2E=20Prado?= Date: Tue, 5 Mar 2024 17:21:38 -0500 Subject: [PATCH 28/28] device: core: Log warning for devices pending deferred probe on timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Once the deferred probe timeout has elapsed it is very likely that the devices that are still deferring probe won't ever be probed. Therefore log the defer probe pending reason at the warning level instead to bring attention to the issue. Signed-off-by: "Nícolas F. R. A. 
Prado" Link: https://lore.kernel.org/r/20240305-device-probe-error-v1-3-a06d8722bf19@collabora.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/dd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/dd.c b/drivers/base/dd.c index d6e7933e2521..83d352394fdf 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -313,7 +313,7 @@ static void deferred_probe_timeout_work_func(struct work_struct *work) mutex_lock(&deferred_probe_mutex); list_for_each_entry(p, &deferred_probe_pending_list, deferred_probe) - dev_info(p->device, "deferred probe pending: %s", p->deferred_probe_reason ?: "(reason unknown)\n"); + dev_warn(p->device, "deferred probe pending: %s", p->deferred_probe_reason ?: "(reason unknown)\n"); mutex_unlock(&deferred_probe_mutex); fw_devlink_probing_done();