From 1211f3b21c2aa0d22d8d7f050e3a5930a91cd0e4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Mar 2024 07:21:02 -1000 Subject: [PATCH 01/14] workqueue: Preserve OFFQ bits in cancel[_sync] paths The cancel[_sync] paths acquire and release WORK_STRUCT_PENDING, and manipulate WORK_OFFQ_CANCELING. However, they assume that all the OFFQ bit values except for the pool ID are statically known and don't preserve them, which is not wrong in the current code as the pool ID and CANCELING are the only information carried. However, the planned disable/enable support will add more fields and need them to be preserved. This patch updates work data handling so that only the bits which need updating are updated. - struct work_offq_data is added along with work_offqd_unpack() and work_offqd_pack_flags() to help manipulating multiple fields contained in work->data. Note that the helpers look a bit silly right now as there isn't that much to pack. The next patch will add more. - mark_work_canceling() which is used only by __cancel_work_sync() is replaced by open-coded usage of work_offq_data and set_work_pool_and_keep_pending() in __cancel_work_sync(). - __cancel_work[_sync]() uses offq_data helpers to preserve other OFFQ bits when clearing WORK_STRUCT_PENDING and WORK_OFFQ_CANCELING at the end. - This removes all users of get_work_pool_id() which is dropped. Note that get_work_pool_id() could handle both WORK_STRUCT_PWQ and !WORK_STRUCT_PWQ cases; however, it was only being called after try_to_grab_pending() succeeded, in which case WORK_STRUCT_PWQ is never set and thus it's safe to use work_offqd_unpack() instead. No behavior changes intended. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 1 + kernel/workqueue.c | 53 +++++++++++++++++++++++---------------- 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 158784dd189a..ae7ae4a51499 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -97,6 +97,7 @@ enum wq_misc_consts { /* Convenience constants - of type 'unsigned long', not 'enum'! */ #define WORK_OFFQ_CANCELING (1ul << WORK_OFFQ_CANCELING_BIT) +#define WORK_OFFQ_FLAG_MASK (((1ul << WORK_OFFQ_FLAG_BITS) - 1) << WORK_OFFQ_FLAG_SHIFT) #define WORK_OFFQ_POOL_NONE ((1ul << WORK_OFFQ_POOL_BITS) - 1) #define WORK_STRUCT_NO_POOL (WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT) #define WORK_STRUCT_PWQ_MASK (~((1ul << WORK_STRUCT_PWQ_SHIFT) - 1)) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0066c8f6c154..d8f37cfa9935 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -392,6 +392,11 @@ struct wq_pod_type { int *cpu_pod; /* cpu -> pod */ }; +struct work_offq_data { + u32 pool_id; + u32 flags; +}; + static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = { [WQ_AFFN_DFL] = "default", [WQ_AFFN_CPU] = "cpu", @@ -892,29 +897,23 @@ static struct worker_pool *get_work_pool(struct work_struct *work) return idr_find(&worker_pool_idr, pool_id); } -/** - * get_work_pool_id - return the worker pool ID a given work is associated with - * @work: the work item of interest - * - * Return: The worker_pool ID @work was last associated with. - * %WORK_OFFQ_POOL_NONE if none. - */ -static int get_work_pool_id(struct work_struct *work) +static unsigned long shift_and_mask(unsigned long v, u32 shift, u32 bits) { - unsigned long data = atomic_long_read(&work->data); - - if (data & WORK_STRUCT_PWQ) - return work_struct_pwq(data)->pool->id; - - return data >> WORK_OFFQ_POOL_SHIFT; + return (v >> shift) & ((1 << bits) - 1); } -static void mark_work_canceling(struct work_struct *work) +static void work_offqd_unpack(struct work_offq_data *offqd, unsigned long data) { - unsigned long pool_id = get_work_pool_id(work); + WARN_ON_ONCE(data & WORK_STRUCT_PWQ); - pool_id <<= WORK_OFFQ_POOL_SHIFT; - set_work_data(work, pool_id | WORK_STRUCT_PENDING | WORK_OFFQ_CANCELING); + offqd->pool_id = shift_and_mask(data, WORK_OFFQ_POOL_SHIFT, + WORK_OFFQ_POOL_BITS); + offqd->flags = data & WORK_OFFQ_FLAG_MASK; +} + +static unsigned long work_offqd_pack_flags(struct work_offq_data *offqd) +{ + return (unsigned long)offqd->flags; } static bool work_is_canceling(struct work_struct *work) @@ -4271,6 +4270,7 @@ EXPORT_SYMBOL(flush_rcu_work); static bool __cancel_work(struct work_struct *work, u32 cflags) { + struct work_offq_data offqd; unsigned long irq_flags; int ret; @@ -4281,19 +4281,26 @@ static bool __cancel_work(struct work_struct *work, u32 cflags) if (unlikely(ret < 0)) return false; - set_work_pool_and_clear_pending(work, get_work_pool_id(work), 0); + work_offqd_unpack(&offqd, *work_data_bits(work)); + set_work_pool_and_clear_pending(work, offqd.pool_id, + work_offqd_pack_flags(&offqd)); local_irq_restore(irq_flags); return ret; } static bool __cancel_work_sync(struct work_struct *work, u32 cflags) { + struct work_offq_data offqd; unsigned long irq_flags; bool ret; /* claim @work and tell other tasks trying to grab @work to back off */ ret = work_grab_pending(work, cflags, &irq_flags); - mark_work_canceling(work); + + work_offqd_unpack(&offqd, *work_data_bits(work)); + offqd.flags |= WORK_OFFQ_CANCELING; + set_work_pool_and_keep_pending(work, offqd.pool_id, + work_offqd_pack_flags(&offqd)); local_irq_restore(irq_flags); /* @@ -4303,12 +4310,16 @@ static bool __cancel_work_sync(struct work_struct *work, u32 cflags) if (wq_online) __flush_work(work, true); + work_offqd_unpack(&offqd, *work_data_bits(work)); + /* * smp_mb() at the end of set_work_pool_and_clear_pending() is paired * with prepare_to_wait() above so that either waitqueue_active() is * visible here or !work_is_canceling() is visible there. */ - set_work_pool_and_clear_pending(work, WORK_OFFQ_POOL_NONE, 0); + offqd.flags &= ~WORK_OFFQ_CANCELING; + set_work_pool_and_clear_pending(work, WORK_OFFQ_POOL_NONE, + work_offqd_pack_flags(&offqd)); if (waitqueue_active(&wq_cancel_waitq)) __wake_up(&wq_cancel_waitq, TASK_NORMAL, 1, work); From 86898fa6b8cd942505860556f3a0bf52eae57fe8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Mar 2024 07:21:03 -1000 Subject: [PATCH 02/14] workqueue: Implement disable/enable for (delayed) work items While (delayed) work items could be flushed and canceled, there was no way to prevent them from being queued in the future. While this didn't lead to functional deficiencies, it sometimes required a bit more effort from the workqueue users to e.g. sequence shutdown steps with more care. Workqueue is currently in the process of replacing tasklet which does support disabling and enabling. The feature is used relatively widely to, for example, temporarily suppress main path while a control plane operation (reset or config change) is in progress. To enable easy conversion of tasklet users and as it seems like an inherent useful feature, this patch implements disabling and enabling of work items. - A work item carries 16bit disable count in work->data while not queued. The access to the count is synchronized by the PENDING bit like all other parts of work->data. - If the count is non-zero, the work item cannot be queued. Any attempt to queue the work item fails and returns %false. - disable_work[_sync](), enable_work(), disable_delayed_work[_sync]() and enable_delayed_work() are added. v3: enable_work() was using local_irq_enable() instead of local_irq_restore() to undo IRQ-disable by work_grab_pending(). This is awkward now and will become incorrect as enable_work() will later be used from IRQ context too. (Lai) v2: Lai noticed that queue_work_node() wasn't checking the disable count. Fixed. queue_rcu_work() is updated to trigger warning if the inner work item is disabled. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 18 +++- kernel/workqueue.c | 177 +++++++++++++++++++++++++++++++++++--- 2 files changed, 182 insertions(+), 13 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index ae7ae4a51499..bd80e66298a0 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -51,20 +51,23 @@ enum work_bits { * data contains off-queue information when !WORK_STRUCT_PWQ. * * MSB - * [ pool ID ] [ OFFQ flags ] [ STRUCT flags ] - * 1 bit 4 or 5 bits + * [ pool ID ] [ disable depth ] [ OFFQ flags ] [ STRUCT flags ] + * 16 bits 1 bit 4 or 5 bits */ WORK_OFFQ_FLAG_SHIFT = WORK_STRUCT_FLAG_BITS, WORK_OFFQ_CANCELING_BIT = WORK_OFFQ_FLAG_SHIFT, WORK_OFFQ_FLAG_END, WORK_OFFQ_FLAG_BITS = WORK_OFFQ_FLAG_END - WORK_OFFQ_FLAG_SHIFT, + WORK_OFFQ_DISABLE_SHIFT = WORK_OFFQ_FLAG_SHIFT + WORK_OFFQ_FLAG_BITS, + WORK_OFFQ_DISABLE_BITS = 16, + /* * When a work item is off queue, the high bits encode off-queue flags * and the last pool it was on. Cap pool ID to 31 bits and use the * highest number to indicate that no pool is associated. */ - WORK_OFFQ_POOL_SHIFT = WORK_OFFQ_FLAG_SHIFT + WORK_OFFQ_FLAG_BITS, + WORK_OFFQ_POOL_SHIFT = WORK_OFFQ_DISABLE_SHIFT + WORK_OFFQ_DISABLE_BITS, WORK_OFFQ_LEFT = BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT, WORK_OFFQ_POOL_BITS = WORK_OFFQ_LEFT <= 31 ? WORK_OFFQ_LEFT : 31, }; @@ -98,6 +101,7 @@ enum wq_misc_consts { /* Convenience constants - of type 'unsigned long', not 'enum'! */ #define WORK_OFFQ_CANCELING (1ul << WORK_OFFQ_CANCELING_BIT) #define WORK_OFFQ_FLAG_MASK (((1ul << WORK_OFFQ_FLAG_BITS) - 1) << WORK_OFFQ_FLAG_SHIFT) +#define WORK_OFFQ_DISABLE_MASK (((1ul << WORK_OFFQ_DISABLE_BITS) - 1) << WORK_OFFQ_DISABLE_SHIFT) #define WORK_OFFQ_POOL_NONE ((1ul << WORK_OFFQ_POOL_BITS) - 1) #define WORK_STRUCT_NO_POOL (WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT) #define WORK_STRUCT_PWQ_MASK (~((1ul << WORK_STRUCT_PWQ_SHIFT) - 1)) @@ -560,6 +564,14 @@ extern bool flush_delayed_work(struct delayed_work *dwork); extern bool cancel_delayed_work(struct delayed_work *dwork); extern bool cancel_delayed_work_sync(struct delayed_work *dwork); +extern bool disable_work(struct work_struct *work); +extern bool disable_work_sync(struct work_struct *work); +extern bool enable_work(struct work_struct *work); + +extern bool disable_delayed_work(struct delayed_work *dwork); +extern bool disable_delayed_work_sync(struct delayed_work *dwork); +extern bool enable_delayed_work(struct delayed_work *dwork); + extern bool flush_rcu_work(struct rcu_work *rwork); extern void workqueue_set_max_active(struct workqueue_struct *wq, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d8f37cfa9935..5c53dde877fd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -99,6 +99,7 @@ enum worker_flags { enum work_cancel_flags { WORK_CANCEL_DELAYED = 1 << 0, /* canceling a delayed_work */ + WORK_CANCEL_DISABLE = 1 << 1, /* canceling to disable */ }; enum wq_internal_consts { @@ -394,6 +395,7 @@ struct wq_pod_type { struct work_offq_data { u32 pool_id; + u32 disable; u32 flags; }; @@ -908,12 +910,15 @@ static void work_offqd_unpack(struct work_offq_data *offqd, unsigned long data) offqd->pool_id = shift_and_mask(data, WORK_OFFQ_POOL_SHIFT, WORK_OFFQ_POOL_BITS); + offqd->disable = shift_and_mask(data, WORK_OFFQ_DISABLE_SHIFT, + WORK_OFFQ_DISABLE_BITS); offqd->flags = data & WORK_OFFQ_FLAG_MASK; } static unsigned long work_offqd_pack_flags(struct work_offq_data *offqd) { - return (unsigned long)offqd->flags; + return ((unsigned long)offqd->disable << WORK_OFFQ_DISABLE_SHIFT) | + ((unsigned long)offqd->flags); } static bool work_is_canceling(struct work_struct *work) @@ -2408,6 +2413,21 @@ out: rcu_read_unlock(); } +static bool clear_pending_if_disabled(struct work_struct *work) +{ + unsigned long data = *work_data_bits(work); + struct work_offq_data offqd; + + if (likely((data & WORK_STRUCT_PWQ) || + !(data & WORK_OFFQ_DISABLE_MASK))) + return false; + + work_offqd_unpack(&offqd, data); + set_work_pool_and_clear_pending(work, offqd.pool_id, + work_offqd_pack_flags(&offqd)); + return true; +} + /** * queue_work_on - queue work on specific cpu * @cpu: CPU number to execute work on @@ -2430,7 +2450,8 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, local_irq_save(irq_flags); - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && + !clear_pending_if_disabled(work)) { __queue_work(cpu, wq, work); ret = true; } @@ -2508,7 +2529,8 @@ bool queue_work_node(int node, struct workqueue_struct *wq, local_irq_save(irq_flags); - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && + !clear_pending_if_disabled(work)) { int cpu = select_numa_node_cpu(node); __queue_work(cpu, wq, work); @@ -2590,7 +2612,8 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, /* read the comment in __queue_work() */ local_irq_save(irq_flags); - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && + !clear_pending_if_disabled(work)) { __queue_delayed_work(cpu, wq, dwork, delay); ret = true; } @@ -2663,7 +2686,12 @@ bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) { struct work_struct *work = &rwork->work; - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + /* + * rcu_work can't be canceled or disabled. Warn if the user reached + * inside @rwork and disabled the inner work. + */ + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && + !WARN_ON_ONCE(clear_pending_if_disabled(work))) { rwork->wq = wq; call_rcu_hurry(&rwork->rcu, rcu_work_rcufn); return true; @@ -4268,20 +4296,46 @@ bool flush_rcu_work(struct rcu_work *rwork) } EXPORT_SYMBOL(flush_rcu_work); +static void work_offqd_disable(struct work_offq_data *offqd) +{ + const unsigned long max = (1lu << WORK_OFFQ_DISABLE_BITS) - 1; + + if (likely(offqd->disable < max)) + offqd->disable++; + else + WARN_ONCE(true, "workqueue: work disable count overflowed\n"); +} + +static void work_offqd_enable(struct work_offq_data *offqd) +{ + if (likely(offqd->disable > 0)) + offqd->disable--; + else + WARN_ONCE(true, "workqueue: work disable count underflowed\n"); +} + static bool __cancel_work(struct work_struct *work, u32 cflags) { struct work_offq_data offqd; unsigned long irq_flags; int ret; - do { - ret = try_to_grab_pending(work, cflags, &irq_flags); - } while (unlikely(ret == -EAGAIN)); + if (cflags & WORK_CANCEL_DISABLE) { + ret = work_grab_pending(work, cflags, &irq_flags); + } else { + do { + ret = try_to_grab_pending(work, cflags, &irq_flags); + } while (unlikely(ret == -EAGAIN)); - if (unlikely(ret < 0)) - return false; + if (unlikely(ret < 0)) + return false; + } work_offqd_unpack(&offqd, *work_data_bits(work)); + + if (cflags & WORK_CANCEL_DISABLE) + work_offqd_disable(&offqd); + set_work_pool_and_clear_pending(work, offqd.pool_id, work_offqd_pack_flags(&offqd)); local_irq_restore(irq_flags); @@ -4298,6 +4352,10 @@ static bool __cancel_work_sync(struct work_struct *work, u32 cflags) ret = work_grab_pending(work, cflags, &irq_flags); work_offqd_unpack(&offqd, *work_data_bits(work)); + + if (cflags & WORK_CANCEL_DISABLE) + work_offqd_disable(&offqd); + offqd.flags |= WORK_OFFQ_CANCELING; set_work_pool_and_keep_pending(work, offqd.pool_id, work_offqd_pack_flags(&offqd)); @@ -4397,6 +4455,105 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork) } EXPORT_SYMBOL(cancel_delayed_work_sync); +/** + * disable_work - Disable and cancel a work item + * @work: work item to disable + * + * Disable @work by incrementing its disable count and cancel it if currently + * pending. As long as the disable count is non-zero, any attempt to queue @work + * will fail and return %false. The maximum supported disable depth is 2 to the + * power of %WORK_OFFQ_DISABLE_BITS, currently 65536. + * + * Must be called from a sleepable context. Returns %true if @work was pending, + * %false otherwise. + */ +bool disable_work(struct work_struct *work) +{ + return __cancel_work(work, WORK_CANCEL_DISABLE); +} +EXPORT_SYMBOL_GPL(disable_work); + +/** + * disable_work_sync - Disable, cancel and drain a work item + * @work: work item to disable + * + * Similar to disable_work() but also wait for @work to finish if currently + * executing. + * + * Must be called from a sleepable context. Returns %true if @work was pending, + * %false otherwise. + */ +bool disable_work_sync(struct work_struct *work) +{ + return __cancel_work_sync(work, WORK_CANCEL_DISABLE); +} +EXPORT_SYMBOL_GPL(disable_work_sync); + +/** + * enable_work - Enable a work item + * @work: work item to enable + * + * Undo disable_work[_sync]() by decrementing @work's disable count. @work can + * only be queued if its disable count is 0. + * + * Must be called from a sleepable context. Returns %true if the disable count + * reached 0. Otherwise, %false. + */ +bool enable_work(struct work_struct *work) +{ + struct work_offq_data offqd; + unsigned long irq_flags; + + work_grab_pending(work, 0, &irq_flags); + + work_offqd_unpack(&offqd, *work_data_bits(work)); + work_offqd_enable(&offqd); + set_work_pool_and_clear_pending(work, offqd.pool_id, + work_offqd_pack_flags(&offqd)); + local_irq_restore(irq_flags); + + return !offqd.disable; +} +EXPORT_SYMBOL_GPL(enable_work); + +/** + * disable_delayed_work - Disable and cancel a delayed work item + * @dwork: delayed work item to disable + * + * disable_work() for delayed work items. + */ +bool disable_delayed_work(struct delayed_work *dwork) +{ + return __cancel_work(&dwork->work, + WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE); +} +EXPORT_SYMBOL_GPL(disable_delayed_work); + +/** + * disable_delayed_work_sync - Disable, cancel and drain a delayed work item + * @dwork: delayed work item to disable + * + * disable_work_sync() for delayed work items. + */ +bool disable_delayed_work_sync(struct delayed_work *dwork) +{ + return __cancel_work_sync(&dwork->work, + WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE); +} +EXPORT_SYMBOL_GPL(disable_delayed_work_sync); + +/** + * enable_delayed_work - Enable a delayed work item + * @dwork: delayed work item to enable + * + * enable_work() for delayed work items. + */ +bool enable_delayed_work(struct delayed_work *dwork) +{ + return enable_work(&dwork->work); +} +EXPORT_SYMBOL_GPL(enable_delayed_work); + /** * schedule_on_each_cpu - execute a function synchronously on each online CPU * @func: the function to call From f09b10b6f442656524d2ee26e45966401a14f54b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Mar 2024 07:21:03 -1000 Subject: [PATCH 03/14] workqueue: Remove WORK_OFFQ_CANCELING cancel[_delayed]_work_sync() guarantees that it can shut down self-requeueing work items. To achieve that, it grabs and then holds WORK_STRUCT_PENDING bit set while flushing the currently executing instance. As the PENDING bit is set, all queueing attempts including the self-requeueing ones fail and once the currently executing instance is flushed, the work item should be idle as long as someone else isn't actively queueing it. This means that the cancel_work_sync path may hold the PENDING bit set while flushing the target work item. This isn't a problem for the queueing path - it can just fail which is the desired effect. It doesn't affect flush. It doesn't matter to cancel_work either as it can just report that the work item has successfully canceled. However, if there's another cancel_work_sync attempt on the work item, it can't simply fail or report success and that would breach the guarantee that it should provide. cancel_work_sync has to wait for and grab that PENDING bit and go through the motions. WORK_OFFQ_CANCELING and wq_cancel_waitq are what implement this cancel_work_sync to cancel_work_sync wait mechanism. When a work item is being canceled, WORK_OFFQ_CANCELING is also set on it and other cancel_work_sync attempts wait on the bit to be cleared using the wait queue. While this works, it's an isolated wart which doesn't jive with the rest of flush and cancel mechanisms and forces enable_work() and disable_work() to require a sleepable context, which hampers their usability. Now that a work item can be disabled, we can use that to block queueing while cancel_work_sync is in progress. Instead of holding PENDING the bit, it can temporarily disable the work item, flush and then re-enable it as that'd achieve the same end result of blocking queueings while canceling and thus enable canceling of self-requeueing work items. - WORK_OFFQ_CANCELING and the surrounding mechanims are removed. - work_grab_pending() is now simpler, no longer has to wait for a blocking operation and thus can be called from any context. - With work_grab_pending() simplified, no need to use try_to_grab_pending() directly. All users are converted to use work_grab_pending(). - __cancel_work_sync() is updated to __cancel_work() with WORK_CANCEL_DISABLE to cancel and plug racing queueing attempts. It then flushes and re-enables the work item if necessary. - These changes allow disable_work() and enable_work() to be called from any context. v2: Lai pointed out that mod_delayed_work_on() needs to check the disable count before queueing the delayed work item. Added clear_pending_if_disabled() call. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 4 +- kernel/workqueue.c | 140 ++++++-------------------------------- 2 files changed, 20 insertions(+), 124 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index bd80e66298a0..a5075969931b 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -52,10 +52,9 @@ enum work_bits { * * MSB * [ pool ID ] [ disable depth ] [ OFFQ flags ] [ STRUCT flags ] - * 16 bits 1 bit 4 or 5 bits + * 16 bits 0 bits 4 or 5 bits */ WORK_OFFQ_FLAG_SHIFT = WORK_STRUCT_FLAG_BITS, - WORK_OFFQ_CANCELING_BIT = WORK_OFFQ_FLAG_SHIFT, WORK_OFFQ_FLAG_END, WORK_OFFQ_FLAG_BITS = WORK_OFFQ_FLAG_END - WORK_OFFQ_FLAG_SHIFT, @@ -99,7 +98,6 @@ enum wq_misc_consts { }; /* Convenience constants - of type 'unsigned long', not 'enum'! */ -#define WORK_OFFQ_CANCELING (1ul << WORK_OFFQ_CANCELING_BIT) #define WORK_OFFQ_FLAG_MASK (((1ul << WORK_OFFQ_FLAG_BITS) - 1) << WORK_OFFQ_FLAG_SHIFT) #define WORK_OFFQ_DISABLE_MASK (((1ul << WORK_OFFQ_DISABLE_BITS) - 1) << WORK_OFFQ_DISABLE_SHIFT) #define WORK_OFFQ_POOL_NONE ((1ul << WORK_OFFQ_POOL_BITS) - 1) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5c53dde877fd..e80a815ec172 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -496,12 +496,6 @@ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; /* I: attributes used when instantiating ordered pools on demand */ static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; -/* - * Used to synchronize multiple cancel_sync attempts on the same work item. See - * work_grab_pending() and __cancel_work_sync(). - */ -static DECLARE_WAIT_QUEUE_HEAD(wq_cancel_waitq); - /* * I: kthread_worker to release pwq's. pwq release needs to be bounced to a * process context while holding a pool lock. Bounce to a dedicated kthread @@ -783,11 +777,6 @@ static int work_next_color(int color) * corresponding to a work. Pool is available once the work has been * queued anywhere after initialization until it is sync canceled. pwq is * available only while the work item is queued. - * - * %WORK_OFFQ_CANCELING is used to mark a work item which is being - * canceled. While being canceled, a work item may have its PENDING set - * but stay off timer and worklist for arbitrarily long and nobody should - * try to steal the PENDING bit. */ static inline void set_work_data(struct work_struct *work, unsigned long data) { @@ -921,13 +910,6 @@ static unsigned long work_offqd_pack_flags(struct work_offq_data *offqd) ((unsigned long)offqd->flags); } -static bool work_is_canceling(struct work_struct *work) -{ - unsigned long data = atomic_long_read(&work->data); - - return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING); -} - /* * Policy functions. These define the policies on how the global worker * pools are managed. Unless noted otherwise, these functions assume that @@ -2058,8 +2040,6 @@ out_put: * 1 if @work was pending and we successfully stole PENDING * 0 if @work was idle and we claimed PENDING * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry - * -ENOENT if someone else is canceling @work, this state may persist - * for arbitrarily long * ======== ================================================================ * * Note: @@ -2155,26 +2135,9 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags, fail: rcu_read_unlock(); local_irq_restore(*irq_flags); - if (work_is_canceling(work)) - return -ENOENT; - cpu_relax(); return -EAGAIN; } -struct cwt_wait { - wait_queue_entry_t wait; - struct work_struct *work; -}; - -static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) -{ - struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait); - - if (cwait->work != key) - return 0; - return autoremove_wake_function(wait, mode, sync, key); -} - /** * work_grab_pending - steal work item from worklist and disable irq * @work: work item to steal @@ -2184,7 +2147,7 @@ static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *k * Grab PENDING bit of @work. @work can be in any stable state - idle, on timer * or on worklist. * - * Must be called in process context. IRQ is disabled on return with IRQ state + * Can be called from any context. IRQ is disabled on return with IRQ state * stored in *@irq_flags. The caller is responsible for re-enabling it using * local_irq_restore(). * @@ -2193,41 +2156,14 @@ static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *k static bool work_grab_pending(struct work_struct *work, u32 cflags, unsigned long *irq_flags) { - struct cwt_wait cwait; int ret; - might_sleep(); -repeat: - ret = try_to_grab_pending(work, cflags, irq_flags); - if (likely(ret >= 0)) - return ret; - if (ret != -ENOENT) - goto repeat; - - /* - * Someone is already canceling. Wait for it to finish. flush_work() - * doesn't work for PREEMPT_NONE because we may get woken up between - * @work's completion and the other canceling task resuming and clearing - * CANCELING - flush_work() will return false immediately as @work is no - * longer busy, try_to_grab_pending() will return -ENOENT as @work is - * still being canceled and the other canceling task won't be able to - * clear CANCELING as we're hogging the CPU. - * - * Let's wait for completion using a waitqueue. As this may lead to the - * thundering herd problem, use a custom wake function which matches - * @work along with exclusive wait and wakeup. - */ - init_wait(&cwait.wait); - cwait.wait.func = cwt_wakefn; - cwait.work = work; - - prepare_to_wait_exclusive(&wq_cancel_waitq, &cwait.wait, - TASK_UNINTERRUPTIBLE); - if (work_is_canceling(work)) - schedule(); - finish_wait(&wq_cancel_waitq, &cwait.wait); - - goto repeat; + while (true) { + ret = try_to_grab_pending(work, cflags, irq_flags); + if (ret >= 0) + return ret; + cpu_relax(); + } } /** @@ -2645,19 +2581,14 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { unsigned long irq_flags; - int ret; + bool ret; - do { - ret = try_to_grab_pending(&dwork->work, WORK_CANCEL_DELAYED, - &irq_flags); - } while (unlikely(ret == -EAGAIN)); + ret = work_grab_pending(&dwork->work, WORK_CANCEL_DELAYED, &irq_flags); - if (likely(ret >= 0)) { + if (!clear_pending_if_disabled(&dwork->work)) __queue_delayed_work(cpu, wq, dwork, delay); - local_irq_restore(irq_flags); - } - /* -ENOENT from try_to_grab_pending() becomes %true */ + local_irq_restore(irq_flags); return ret; } EXPORT_SYMBOL_GPL(mod_delayed_work_on); @@ -4320,16 +4251,7 @@ static bool __cancel_work(struct work_struct *work, u32 cflags) unsigned long irq_flags; int ret; - if (cflags & WORK_CANCEL_DISABLE) { - ret = work_grab_pending(work, cflags, &irq_flags); - } else { - do { - ret = try_to_grab_pending(work, cflags, &irq_flags); - } while (unlikely(ret == -EAGAIN)); - - if (unlikely(ret < 0)) - return false; - } + ret = work_grab_pending(work, cflags, &irq_flags); work_offqd_unpack(&offqd, *work_data_bits(work)); @@ -4344,22 +4266,9 @@ static bool __cancel_work(struct work_struct *work, u32 cflags) static bool __cancel_work_sync(struct work_struct *work, u32 cflags) { - struct work_offq_data offqd; - unsigned long irq_flags; bool ret; - /* claim @work and tell other tasks trying to grab @work to back off */ - ret = work_grab_pending(work, cflags, &irq_flags); - - work_offqd_unpack(&offqd, *work_data_bits(work)); - - if (cflags & WORK_CANCEL_DISABLE) - work_offqd_disable(&offqd); - - offqd.flags |= WORK_OFFQ_CANCELING; - set_work_pool_and_keep_pending(work, offqd.pool_id, - work_offqd_pack_flags(&offqd)); - local_irq_restore(irq_flags); + ret = __cancel_work(work, cflags | WORK_CANCEL_DISABLE); /* * Skip __flush_work() during early boot when we know that @work isn't @@ -4368,19 +4277,8 @@ static bool __cancel_work_sync(struct work_struct *work, u32 cflags) if (wq_online) __flush_work(work, true); - work_offqd_unpack(&offqd, *work_data_bits(work)); - - /* - * smp_mb() at the end of set_work_pool_and_clear_pending() is paired - * with prepare_to_wait() above so that either waitqueue_active() is - * visible here or !work_is_canceling() is visible there. - */ - offqd.flags &= ~WORK_OFFQ_CANCELING; - set_work_pool_and_clear_pending(work, WORK_OFFQ_POOL_NONE, - work_offqd_pack_flags(&offqd)); - - if (waitqueue_active(&wq_cancel_waitq)) - __wake_up(&wq_cancel_waitq, TASK_NORMAL, 1, work); + if (!(cflags & WORK_CANCEL_DISABLE)) + enable_work(work); return ret; } @@ -4464,8 +4362,8 @@ EXPORT_SYMBOL(cancel_delayed_work_sync); * will fail and return %false. The maximum supported disable depth is 2 to the * power of %WORK_OFFQ_DISABLE_BITS, currently 65536. * - * Must be called from a sleepable context. Returns %true if @work was pending, - * %false otherwise. + * Can be called from any context. Returns %true if @work was pending, %false + * otherwise. */ bool disable_work(struct work_struct *work) { @@ -4496,8 +4394,8 @@ EXPORT_SYMBOL_GPL(disable_work_sync); * Undo disable_work[_sync]() by decrementing @work's disable count. @work can * only be queued if its disable count is 0. * - * Must be called from a sleepable context. Returns %true if the disable count - * reached 0. Otherwise, %false. + * Can be called from any context. Returns %true if the disable count reached 0. + * Otherwise, %false. */ bool enable_work(struct work_struct *work) { From 456a78eef2670d0e9521e87f35a056de8fec7fb2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Mar 2024 07:21:03 -1000 Subject: [PATCH 04/14] workqueue: Remember whether a work item was on a BH workqueue Add an off-queue flag, WORK_OFFQ_BH, that indicates whether the last workqueue the work item was on was a BH one. This will be used to test whether a work item is BH in cancel_sync path to implement atomic cancel_sync'ing for BH work items. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 4 +++- kernel/workqueue.c | 10 ++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index a5075969931b..777b0186317e 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -52,9 +52,10 @@ enum work_bits { * * MSB * [ pool ID ] [ disable depth ] [ OFFQ flags ] [ STRUCT flags ] - * 16 bits 0 bits 4 or 5 bits + * 16 bits 1 bit 4 or 5 bits */ WORK_OFFQ_FLAG_SHIFT = WORK_STRUCT_FLAG_BITS, + WORK_OFFQ_BH_BIT = WORK_OFFQ_FLAG_SHIFT, WORK_OFFQ_FLAG_END, WORK_OFFQ_FLAG_BITS = WORK_OFFQ_FLAG_END - WORK_OFFQ_FLAG_SHIFT, @@ -98,6 +99,7 @@ enum wq_misc_consts { }; /* Convenience constants - of type 'unsigned long', not 'enum'! */ +#define WORK_OFFQ_BH (1ul << WORK_OFFQ_BH_BIT) #define WORK_OFFQ_FLAG_MASK (((1ul << WORK_OFFQ_FLAG_BITS) - 1) << WORK_OFFQ_FLAG_SHIFT) #define WORK_OFFQ_DISABLE_MASK (((1ul << WORK_OFFQ_DISABLE_BITS) - 1) << WORK_OFFQ_DISABLE_SHIFT) #define WORK_OFFQ_POOL_NONE ((1ul << WORK_OFFQ_POOL_BITS) - 1) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e80a815ec172..baf7495338bc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -764,6 +764,11 @@ static int work_next_color(int color) return (color + 1) % WORK_NR_COLORS; } +static unsigned long pool_offq_flags(struct worker_pool *pool) +{ + return (pool->flags & POOL_BH) ? WORK_OFFQ_BH : 0; +} + /* * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data * contain the pointer to the queued pwq. Once execution starts, the flag @@ -2122,7 +2127,8 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags, * this destroys work->data needed by the next step, stash it. */ work_data = *work_data_bits(work); - set_work_pool_and_keep_pending(work, pool->id, 0); + set_work_pool_and_keep_pending(work, pool->id, + pool_offq_flags(pool)); /* must be the last step, see the function comment */ pwq_dec_nr_in_flight(pwq, work_data); @@ -3175,7 +3181,7 @@ __acquires(&pool->lock) * PENDING and queued state changes happen together while IRQ is * disabled. */ - set_work_pool_and_clear_pending(work, pool->id, 0); + set_work_pool_and_clear_pending(work, pool->id, pool_offq_flags(pool)); pwq->stats[PWQ_STAT_STARTED]++; raw_spin_unlock_irq(&pool->lock); From 134874e2eee9380c2700411d4844cbc29297bc01 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Mar 2024 07:21:03 -1000 Subject: [PATCH 05/14] workqueue: Allow cancel_work_sync() and disable_work() from atomic contexts on BH work items Now that work_grab_pending() can always grab the PENDING bit without sleeping, the only thing that prevents allowing cancel_work_sync() of a BH work item from an atomic context is the flushing of the in-flight instance. When we're flushing a BH work item for cancel_work_sync(), we know that the work item is not queued and must be executing in a BH context, which means that it's safe to busy-wait for its completion from a non-hardirq atomic context. This patch updates __flush_work() so that it busy-waits when flushing a BH work item for cancel_work_sync(). might_sleep() is pushed from start_flush_work() to its callers - when operating on a BH work item, __cancel_work_sync() now enforces !in_hardirq() instead of might_sleep(). This allows cancel_work_sync() and disable_work() to be called from non-hardirq atomic contexts on BH work items. v3: In __flush_work(), test WORK_OFFQ_BH to tell whether a work item being canceled can be busy waited instead of making start_flush_work() return the pool. (Lai) v2: Lai pointed out that __flush_work() was accessing pool->flags outside the RCU critical section protecting the pool pointer. Fix it by testing and remembering the result inside the RCU critical section. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 74 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 55 insertions(+), 19 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index baf7495338bc..c0cc8b209d5c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4105,8 +4105,6 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, struct pool_workqueue *pwq; struct workqueue_struct *wq; - might_sleep(); - rcu_read_lock(); pool = get_work_pool(work); if (!pool) { @@ -4158,6 +4156,7 @@ already_gone: static bool __flush_work(struct work_struct *work, bool from_cancel) { struct wq_barrier barr; + unsigned long data; if (WARN_ON(!wq_online)) return false; @@ -4165,13 +4164,41 @@ static bool __flush_work(struct work_struct *work, bool from_cancel) if (WARN_ON(!work->func)) return false; - if (start_flush_work(work, &barr, from_cancel)) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); - return true; - } else { + if (!start_flush_work(work, &barr, from_cancel)) return false; + + /* + * start_flush_work() returned %true. If @from_cancel is set, we know + * that @work must have been executing during start_flush_work() and + * can't currently be queued. Its data must contain OFFQ bits. If @work + * was queued on a BH workqueue, we also know that it was running in the + * BH context and thus can be busy-waited. + */ + data = *work_data_bits(work); + if (from_cancel && + !WARN_ON_ONCE(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_BH)) { + /* + * On RT, prevent a live lock when %current preempted soft + * interrupt processing or prevents ksoftirqd from running by + * keeping flipping BH. If the BH work item runs on a different + * CPU then this has no effect other than doing the BH + * disable/enable dance for nothing. This is copied from + * kernel/softirq.c::tasklet_unlock_spin_wait(). + */ + while (!try_wait_for_completion(&barr.done)) { + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + local_bh_disable(); + local_bh_enable(); + } else { + cpu_relax(); + } + } + } else { + wait_for_completion(&barr.done); } + + destroy_work_on_stack(&barr.work); + return true; } /** @@ -4187,6 +4214,7 @@ static bool __flush_work(struct work_struct *work, bool from_cancel) */ bool flush_work(struct work_struct *work) { + might_sleep(); return __flush_work(work, false); } EXPORT_SYMBOL_GPL(flush_work); @@ -4276,6 +4304,11 @@ static bool __cancel_work_sync(struct work_struct *work, u32 cflags) ret = __cancel_work(work, cflags | WORK_CANCEL_DISABLE); + if (*work_data_bits(work) & WORK_OFFQ_BH) + WARN_ON_ONCE(in_hardirq()); + else + might_sleep(); + /* * Skip __flush_work() during early boot when we know that @work isn't * executing. This allows canceling during early boot. @@ -4302,19 +4335,19 @@ EXPORT_SYMBOL(cancel_work); * cancel_work_sync - cancel a work and wait for it to finish * @work: the work to cancel * - * Cancel @work and wait for its execution to finish. This function - * can be used even if the work re-queues itself or migrates to - * another workqueue. On return from this function, @work is - * guaranteed to be not pending or executing on any CPU. + * Cancel @work and wait for its execution to finish. This function can be used + * even if the work re-queues itself or migrates to another workqueue. On return + * from this function, @work is guaranteed to be not pending or executing on any + * CPU as long as there aren't racing enqueues. * - * cancel_work_sync(&delayed_work->work) must not be used for - * delayed_work's. Use cancel_delayed_work_sync() instead. + * cancel_work_sync(&delayed_work->work) must not be used for delayed_work's. + * Use cancel_delayed_work_sync() instead. * - * The caller must ensure that the workqueue on which @work was last - * queued can't be destroyed before this function returns. + * Must be called from a sleepable context if @work was last queued on a non-BH + * workqueue. Can also be called from non-hardirq atomic contexts including BH + * if @work was last queued on a BH workqueue. * - * Return: - * %true if @work was pending, %false otherwise. + * Returns %true if @work was pending, %false otherwise. */ bool cancel_work_sync(struct work_struct *work) { @@ -4384,8 +4417,11 @@ EXPORT_SYMBOL_GPL(disable_work); * Similar to disable_work() but also wait for @work to finish if currently * executing. * - * Must be called from a sleepable context. Returns %true if @work was pending, - * %false otherwise. + * Must be called from a sleepable context if @work was last queued on a non-BH + * workqueue. Can also be called from non-hardirq atomic contexts including BH + * if @work was last queued on a BH workqueue. + * + * Returns %true if @work was pending, %false otherwise. */ bool disable_work_sync(struct work_struct *work) { From e7cc3be6fdb57d98fc399a856fc3b05cce1ca754 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 8 Mar 2024 17:42:50 +0800 Subject: [PATCH 06/14] workqueue: Use INIT_WORK_ONSTACK in workqueue_softirq_dead() dead_work is a stack variable. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c0cc8b209d5c..45d2aae73c96 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3651,7 +3651,7 @@ void workqueue_softirq_dead(unsigned int cpu) if (!need_more_worker(pool)) continue; - INIT_WORK(&dead_work.work, drain_dead_softirq_workfn); + INIT_WORK_ONSTACK(&dead_work.work, drain_dead_softirq_workfn); dead_work.pool = pool; init_completion(&dead_work.done); From ae1296a7bfe4f8e446677ccb761d9419926557bc Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 8 Mar 2024 17:42:52 +0800 Subject: [PATCH 07/14] workqueue: Move attrs->cpumask out of worker_pool's properties when attrs->affn_strict Allow more pools can be shared when attrs->affn_strict. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 3 +++ kernel/workqueue.c | 13 ++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 777b0186317e..bfcf8d38f4b1 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -185,6 +185,9 @@ struct workqueue_attrs { * Below fields aren't properties of a worker_pool. They only modify how * :c:func:`apply_workqueue_attrs` select pools and thus don't * participate in pool hash calculations or equality comparisons. + * + * If @affn_strict is set, @cpumask isn't a property of a worker_pool + * either. */ /** diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 45d2aae73c96..f03960f094fa 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4625,6 +4625,8 @@ static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs) { attrs->affn_scope = WQ_AFFN_NR_TYPES; attrs->ordered = false; + if (attrs->affn_strict) + cpumask_copy(attrs->cpumask, cpu_possible_mask); } /* hash value of the content of @attr */ @@ -4633,11 +4635,12 @@ static u32 wqattrs_hash(const struct workqueue_attrs *attrs) u32 hash = 0; hash = jhash_1word(attrs->nice, hash); - hash = jhash(cpumask_bits(attrs->cpumask), - BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); + hash = jhash_1word(attrs->affn_strict, hash); hash = jhash(cpumask_bits(attrs->__pod_cpumask), BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); - hash = jhash_1word(attrs->affn_strict, hash); + if (!attrs->affn_strict) + hash = jhash(cpumask_bits(attrs->cpumask), + BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); return hash; } @@ -4647,11 +4650,11 @@ static bool wqattrs_equal(const struct workqueue_attrs *a, { if (a->nice != b->nice) return false; - if (!cpumask_equal(a->cpumask, b->cpumask)) + if (a->affn_strict != b->affn_strict) return false; if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask)) return false; - if (a->affn_strict != b->affn_strict) + if (!a->affn_strict && !cpumask_equal(a->cpumask, b->cpumask)) return false; return true; } From d70f5d5778e88addcc3e56858d5e9c635c1e420e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 8 Mar 2024 17:42:53 +0800 Subject: [PATCH 08/14] workqueue: Use list_last_entry() to get the last idle worker It is clearer than open code. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f03960f094fa..d1ccc3d05b7a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2904,7 +2904,7 @@ static void idle_worker_timeout(struct timer_list *t) unsigned long expires; /* idle_list is kept in LIFO order, check the last one */ - worker = list_entry(pool->idle_list.prev, struct worker, entry); + worker = list_last_entry(&pool->idle_list, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; do_cull = !time_before(jiffies, expires); @@ -2946,7 +2946,7 @@ static void idle_cull_fn(struct work_struct *work) struct worker *worker; unsigned long expires; - worker = list_entry(pool->idle_list.prev, struct worker, entry); + worker = list_last_entry(&pool->idle_list, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; if (time_before(jiffies, expires)) { From 79202591a55a365251496162ced3004a0a1fa1cf Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 7 Mar 2024 21:39:32 -0800 Subject: [PATCH 09/14] workqueue: Cleanup subsys attribute registration While reviewing users of subsys_virtual_register() I noticed that wq_sysfs_init() ignores the @groups argument. This looks like a historical artifact as the original wq_subsys only had one attribute to register. On the way to building up an @groups argument to pass to subsys_virtual_register() a few more cleanups fell out: * Use DEVICE_ATTR_RO() and DEVICE_ATTR_RW() for cpumask_{isolated,requested} and cpumask respectively. Rename the @show and @store methods accordingly. * Co-locate the attribute definition with the methods. This required moving wq_unbound_cpumask_show down next to wq_unbound_cpumask_store (renamed to cpumask_show() and cpumask_store()) * Use ATTRIBUTE_GROUPS() to skip some boilerplate declarations Signed-off-by: Dan Williams Reviewed-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 63 ++++++++++++++++++---------------------------- 1 file changed, 24 insertions(+), 39 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d1ccc3d05b7a..a8cbaede1e22 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7246,25 +7246,27 @@ static ssize_t __wq_cpumask_show(struct device *dev, return written; } -static ssize_t wq_unbound_cpumask_show(struct device *dev, +static ssize_t cpumask_requested_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask); +} +static DEVICE_ATTR_RO(cpumask_requested); + +static ssize_t cpumask_isolated_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask); +} +static DEVICE_ATTR_RO(cpumask_isolated); + +static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) { return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask); } -static ssize_t wq_requested_cpumask_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask); -} - -static ssize_t wq_isolated_cpumask_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask); -} - -static ssize_t wq_unbound_cpumask_store(struct device *dev, +static ssize_t cpumask_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { cpumask_var_t cpumask; @@ -7280,36 +7282,19 @@ static ssize_t wq_unbound_cpumask_store(struct device *dev, free_cpumask_var(cpumask); return ret ? ret : count; } +static DEVICE_ATTR_RW(cpumask); -static struct device_attribute wq_sysfs_cpumask_attrs[] = { - __ATTR(cpumask, 0644, wq_unbound_cpumask_show, - wq_unbound_cpumask_store), - __ATTR(cpumask_requested, 0444, wq_requested_cpumask_show, NULL), - __ATTR(cpumask_isolated, 0444, wq_isolated_cpumask_show, NULL), - __ATTR_NULL, +static struct attribute *wq_sysfs_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + &dev_attr_cpumask_requested.attr, + &dev_attr_cpumask_isolated.attr, + NULL, }; +ATTRIBUTE_GROUPS(wq_sysfs_cpumask); static int __init wq_sysfs_init(void) { - struct device *dev_root; - int err; - - err = subsys_virtual_register(&wq_subsys, NULL); - if (err) - return err; - - dev_root = bus_get_dev_root(&wq_subsys); - if (dev_root) { - struct device_attribute *attr; - - for (attr = wq_sysfs_cpumask_attrs; attr->attr.name; attr++) { - err = device_create_file(dev_root, attr); - if (err) - break; - } - put_device(dev_root); - } - return err; + return subsys_virtual_register(&wq_subsys, wq_sysfs_cpumask_groups); } core_initcall(wq_sysfs_init); From d6a7bbdde67227127e5e33fb9500bcc4abc40fb3 Mon Sep 17 00:00:00 2001 From: Kassey Li Date: Fri, 8 Mar 2024 10:18:18 +0800 Subject: [PATCH 10/14] workqueue: add function in event of workqueue_activate_work The trace event "workqueue_activate_work" only print work struct. However, function is the region of interest in a full sequence of work. Current workqueue_activate_work trace event output: workqueue_activate_work: work struct ffffff88b4a0f450 With this change, workqueue_activate_work will print the function name, align with workqueue_queue_work/execute_start/execute_end event. workqueue_activate_work: work struct ffffff80413a78b8 function=vmstat_update Signed-off-by: Kassey Li Reviewed-by: Steven Rostedt (Google) Signed-off-by: Tejun Heo --- include/trace/events/workqueue.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h index 262d52021c23..6ef5b7254070 100644 --- a/include/trace/events/workqueue.h +++ b/include/trace/events/workqueue.h @@ -64,13 +64,15 @@ TRACE_EVENT(workqueue_activate_work, TP_STRUCT__entry( __field( void *, work ) + __field( void *, function) ), TP_fast_assign( __entry->work = work; + __entry->function = work->func; ), - TP_printk("work struct %p", __entry->work) + TP_printk("work struct %p function=%ps ", __entry->work, __entry->function) ); /** From 474a549ff4c989427a14fdab851e562c8a63fe24 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Mon, 25 Mar 2024 18:02:01 +0000 Subject: [PATCH 11/14] workqueue: Introduce enable_and_queue_work() convenience function The enable_and_queue_work() function is introduced to streamline the process of enabling and queuing a work item on a specific workqueue. This function combines the functionalities of enable_work() and queue_work() in a single call, providing a concise and convenient API for enabling and queuing work items. The function accepts a target workqueue and a work item as parameters. It first attempts to enable the work item using enable_work(). A successful enable operation means that the work item was previously disabled and is now marked as eligible for execution. If the enable operation is successful, the work item is then queued on the specified workqueue using queue_work(). The function returns true if the work item was successfully enabled and queued, and false otherwise. Note: This function may lead to unnecessary spurious wake-ups in cases where the work item is expected to be dormant but enable/disable are called frequently. Spurious wake-ups refer to the condition where worker threads are woken up without actual work to be done. Callers should be aware of this behavior and may need to employ additional synchronization mechanisms to avoid these overheads if such wake-ups are not desired. This addition aims to enhance code readability and maintainability by providing a unified interface for the common use case of enabling and queuing work items on a workqueue. tj: Made the function comment more compact. Signed-off-by: Allen Pais Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index bfcf8d38f4b1..2df1188c0f96 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -682,6 +682,32 @@ static inline bool schedule_work(struct work_struct *work) return queue_work(system_wq, work); } +/** + * enable_and_queue_work - Enable and queue a work item on a specific workqueue + * @wq: The target workqueue + * @work: The work item to be enabled and queued + * + * This function combines the operations of enable_work() and queue_work(), + * providing a convenient way to enable and queue a work item in a single call. + * It invokes enable_work() on @work and then queues it if the disable depth + * reached 0. Returns %true if the disable depth reached 0 and @work is queued, + * and %false otherwise. + * + * Note that @work is always queued when disable depth reaches zero. If the + * desired behavior is queueing only if certain events took place while @work is + * disabled, the user should implement the necessary state tracking and perform + * explicit conditional queueing after enable_work(). + */ +static inline bool enable_and_queue_work(struct workqueue_struct *wq, + struct work_struct *work) +{ + if (enable_work(work)) { + queue_work(wq, work); + return true; + } + return false; +} + /* * Detect attempt to flush system-wide workqueues at compile time when possible. * Warn attempt to flush system-wide workqueues at runtime. From 8034b31464c53d6e182f65a293a87b50ddf6dd7e Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 21 Mar 2024 23:04:20 +0800 Subject: [PATCH 12/14] workqueue: remove unnecessary import and function in wq_monitor.py Remove unnecessary import and function in wq_monitor.py Signed-off-by: Kemeng Shi Signed-off-by: Tejun Heo --- tools/workqueue/wq_monitor.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tools/workqueue/wq_monitor.py b/tools/workqueue/wq_monitor.py index a8856a9c45dc..9e964c5be40c 100644 --- a/tools/workqueue/wq_monitor.py +++ b/tools/workqueue/wq_monitor.py @@ -32,16 +32,13 @@ https://github.com/osandov/drgn. rescued The number of work items executed by the rescuer. """ -import sys import signal -import os import re import time import json import drgn -from drgn.helpers.linux.list import list_for_each_entry,list_empty -from drgn.helpers.linux.cpumask import for_each_possible_cpu +from drgn.helpers.linux.list import list_for_each_entry import argparse parser = argparse.ArgumentParser(description=desc, @@ -54,10 +51,6 @@ parser.add_argument('-j', '--json', action='store_true', help='Output in json') args = parser.parse_args() -def err(s): - print(s, file=sys.stderr, flush=True) - sys.exit(1) - workqueues = prog['workqueues'] WQ_UNBOUND = prog['WQ_UNBOUND'] From 31103f40b1b5d4382446b4d5af37e61dce31f8d5 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Mon, 8 Apr 2024 16:44:04 +0800 Subject: [PATCH 13/14] workqueue: Add destroy_work_on_stack() in workqueue_softirq_dead() This commit add missed destroy_work_on_stack() operations for dead_work.work. Signed-off-by: Zqiang Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a8cbaede1e22..3c3154b40698 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3661,6 +3661,7 @@ void workqueue_softirq_dead(unsigned int cpu) queue_work(system_bh_wq, &dead_work.work); wait_for_completion(&dead_work.done); + destroy_work_on_stack(&dead_work.work); } } From 51da7f68edae38e81543d57fd71811f7481c0472 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Apr 2024 10:03:13 -1000 Subject: [PATCH 14/14] workqueue: Use "@..." in function comment to describe variable length argument Previously, it was using "remaining args" without leading "@" which isn't valid. Let's follow snprintf()'s example and use "@...". Signed-off-by: Tejun Heo Reported-by: Stephen Rothwell --- include/linux/workqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 2df1188c0f96..fb3993894536 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -473,7 +473,7 @@ void workqueue_softirq_dead(unsigned int cpu); * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags * @max_active: max in-flight work items, 0 for default - * remaining args: args for @fmt + * @...: args for @fmt * * For a per-cpu workqueue, @max_active limits the number of in-flight work * items for each CPU. e.g. @max_active of 1 indicates that each CPU can be