mirror of
https://github.com/torvalds/linux.git
synced 2024-11-10 06:01:57 +00:00
blk-mq: remove hybrid polling
io_uring provides the only way user space can poll completions, and that always sets BLK_POLL_NOSLEEP. This effectively makes hybrid polling dead code, so remove it and everything supporting it. Hybrid polling was effectively killed off with9650b453a3
, "block: ignore RWF_HIPRI hint for sync dio", but still potentially reachable through io_uring untild729cf9acb
, "io_uring: don't sleep when polling for I/O", but hybrid polling probably should not have been reachable through that async interface from the beginning. Fixes:9650b453a3
("block: ignore RWF_HIPRI hint for sync dio") Fixes:d729cf9acb
("io_uring: don't sleep when polling for I/O") Signed-off-by: Keith Busch <kbusch@kernel.org> Link: https://lore.kernel.org/r/20230320194926.3353144-1-kbusch@meta.com Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
parent
4cf2c3ab2c
commit
54bdd67d0f
@ -336,18 +336,11 @@ What: /sys/block/<disk>/queue/io_poll_delay
|
||||
Date: November 2016
|
||||
Contact: linux-block@vger.kernel.org
|
||||
Description:
|
||||
[RW] If polling is enabled, this controls what kind of polling
|
||||
will be performed. It defaults to -1, which is classic polling.
|
||||
[RW] This was used to control what kind of polling will be
|
||||
performed. It is now fixed to -1, which is classic polling.
|
||||
In this mode, the CPU will repeatedly ask for completions
|
||||
without giving up any time. If set to 0, a hybrid polling mode
|
||||
is used, where the kernel will attempt to make an educated guess
|
||||
at when the IO will complete. Based on this guess, the kernel
|
||||
will put the process issuing IO to sleep for an amount of time,
|
||||
before entering a classic poll loop. This mode might be a little
|
||||
slower than pure classic polling, but it will be more efficient.
|
||||
If set to a value larger than 0, the kernel will put the process
|
||||
issuing IO to sleep for this amount of microseconds before
|
||||
entering classic polling.
|
||||
without giving up any time.
|
||||
<deprecated>
|
||||
|
||||
|
||||
What: /sys/block/<disk>/queue/io_timeout
|
||||
|
@ -263,13 +263,7 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
|
||||
|
||||
static void blk_free_queue(struct request_queue *q)
|
||||
{
|
||||
if (q->poll_stat)
|
||||
blk_stat_remove_callback(q, q->poll_cb);
|
||||
blk_stat_free_callback(q->poll_cb);
|
||||
|
||||
blk_free_queue_stats(q->stats);
|
||||
kfree(q->poll_stat);
|
||||
|
||||
if (queue_is_mq(q))
|
||||
blk_mq_release(q);
|
||||
|
||||
|
@ -15,33 +15,8 @@
|
||||
#include "blk-mq-tag.h"
|
||||
#include "blk-rq-qos.h"
|
||||
|
||||
static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
|
||||
{
|
||||
if (stat->nr_samples) {
|
||||
seq_printf(m, "samples=%d, mean=%llu, min=%llu, max=%llu",
|
||||
stat->nr_samples, stat->mean, stat->min, stat->max);
|
||||
} else {
|
||||
seq_puts(m, "samples=0");
|
||||
}
|
||||
}
|
||||
|
||||
static int queue_poll_stat_show(void *data, struct seq_file *m)
|
||||
{
|
||||
struct request_queue *q = data;
|
||||
int bucket;
|
||||
|
||||
if (!q->poll_stat)
|
||||
return 0;
|
||||
|
||||
for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) {
|
||||
seq_printf(m, "read (%d Bytes): ", 1 << (9 + bucket));
|
||||
print_stat(m, &q->poll_stat[2 * bucket]);
|
||||
seq_puts(m, "\n");
|
||||
|
||||
seq_printf(m, "write (%d Bytes): ", 1 << (9 + bucket));
|
||||
print_stat(m, &q->poll_stat[2 * bucket + 1]);
|
||||
seq_puts(m, "\n");
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -282,7 +257,6 @@ static const char *const rqf_name[] = {
|
||||
RQF_NAME(STATS),
|
||||
RQF_NAME(SPECIAL_PAYLOAD),
|
||||
RQF_NAME(ZONE_WRITE_LOCKED),
|
||||
RQF_NAME(MQ_POLL_SLEPT),
|
||||
RQF_NAME(TIMED_OUT),
|
||||
RQF_NAME(ELV),
|
||||
RQF_NAME(RESV),
|
||||
|
205
block/blk-mq.c
205
block/blk-mq.c
@ -46,51 +46,15 @@
|
||||
|
||||
static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
|
||||
|
||||
static void blk_mq_poll_stats_start(struct request_queue *q);
|
||||
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
|
||||
|
||||
static int blk_mq_poll_stats_bkt(const struct request *rq)
|
||||
{
|
||||
int ddir, sectors, bucket;
|
||||
|
||||
ddir = rq_data_dir(rq);
|
||||
sectors = blk_rq_stats_sectors(rq);
|
||||
|
||||
bucket = ddir + 2 * ilog2(sectors);
|
||||
|
||||
if (bucket < 0)
|
||||
return -1;
|
||||
else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
|
||||
return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
|
||||
|
||||
return bucket;
|
||||
}
|
||||
|
||||
#define BLK_QC_T_SHIFT 16
|
||||
#define BLK_QC_T_INTERNAL (1U << 31)
|
||||
|
||||
static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q,
|
||||
blk_qc_t qc)
|
||||
{
|
||||
return xa_load(&q->hctx_table,
|
||||
(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT);
|
||||
}
|
||||
|
||||
static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx,
|
||||
blk_qc_t qc)
|
||||
{
|
||||
unsigned int tag = qc & ((1U << BLK_QC_T_SHIFT) - 1);
|
||||
|
||||
if (qc & BLK_QC_T_INTERNAL)
|
||||
return blk_mq_tag_to_rq(hctx->sched_tags, tag);
|
||||
return blk_mq_tag_to_rq(hctx->tags, tag);
|
||||
return xa_load(&q->hctx_table, qc);
|
||||
}
|
||||
|
||||
static inline blk_qc_t blk_rq_to_qc(struct request *rq)
|
||||
{
|
||||
return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) |
|
||||
(rq->tag != -1 ?
|
||||
rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL));
|
||||
return rq->mq_hctx->queue_num;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1038,10 +1002,8 @@ static inline void blk_account_io_start(struct request *req)
|
||||
|
||||
static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
|
||||
{
|
||||
if (rq->rq_flags & RQF_STATS) {
|
||||
blk_mq_poll_stats_start(rq->q);
|
||||
if (rq->rq_flags & RQF_STATS)
|
||||
blk_stat_add(rq, now);
|
||||
}
|
||||
|
||||
blk_mq_sched_completed_request(rq, now);
|
||||
blk_account_io_done(rq, now);
|
||||
@ -4222,14 +4184,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
|
||||
/* mark the queue as mq asap */
|
||||
q->mq_ops = set->ops;
|
||||
|
||||
q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
|
||||
blk_mq_poll_stats_bkt,
|
||||
BLK_MQ_POLL_STATS_BKTS, q);
|
||||
if (!q->poll_cb)
|
||||
goto err_exit;
|
||||
|
||||
if (blk_mq_alloc_ctxs(q))
|
||||
goto err_poll;
|
||||
goto err_exit;
|
||||
|
||||
/* init q->mq_kobj and sw queues' kobjects */
|
||||
blk_mq_sysfs_init(q);
|
||||
@ -4257,11 +4213,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
|
||||
|
||||
q->nr_requests = set->queue_depth;
|
||||
|
||||
/*
|
||||
* Default to classic polling
|
||||
*/
|
||||
q->poll_nsec = BLK_MQ_POLL_CLASSIC;
|
||||
|
||||
blk_mq_init_cpu_queues(q, set->nr_hw_queues);
|
||||
blk_mq_add_queue_tag_set(set, q);
|
||||
blk_mq_map_swqueue(q);
|
||||
@ -4269,9 +4220,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
|
||||
|
||||
err_hctxs:
|
||||
blk_mq_release(q);
|
||||
err_poll:
|
||||
blk_stat_free_callback(q->poll_cb);
|
||||
q->poll_cb = NULL;
|
||||
err_exit:
|
||||
q->mq_ops = NULL;
|
||||
return -ENOMEM;
|
||||
@ -4768,138 +4716,8 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
|
||||
|
||||
/* Enable polling stats and return whether they were already enabled. */
|
||||
static bool blk_poll_stats_enable(struct request_queue *q)
|
||||
{
|
||||
if (q->poll_stat)
|
||||
return true;
|
||||
|
||||
return blk_stats_alloc_enable(q);
|
||||
}
|
||||
|
||||
static void blk_mq_poll_stats_start(struct request_queue *q)
|
||||
{
|
||||
/*
|
||||
* We don't arm the callback if polling stats are not enabled or the
|
||||
* callback is already active.
|
||||
*/
|
||||
if (!q->poll_stat || blk_stat_is_active(q->poll_cb))
|
||||
return;
|
||||
|
||||
blk_stat_activate_msecs(q->poll_cb, 100);
|
||||
}
|
||||
|
||||
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
|
||||
{
|
||||
struct request_queue *q = cb->data;
|
||||
int bucket;
|
||||
|
||||
for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
|
||||
if (cb->stat[bucket].nr_samples)
|
||||
q->poll_stat[bucket] = cb->stat[bucket];
|
||||
}
|
||||
}
|
||||
|
||||
static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
|
||||
struct request *rq)
|
||||
{
|
||||
unsigned long ret = 0;
|
||||
int bucket;
|
||||
|
||||
/*
|
||||
* If stats collection isn't on, don't sleep but turn it on for
|
||||
* future users
|
||||
*/
|
||||
if (!blk_poll_stats_enable(q))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* As an optimistic guess, use half of the mean service time
|
||||
* for this type of request. We can (and should) make this smarter.
|
||||
* For instance, if the completion latencies are tight, we can
|
||||
* get closer than just half the mean. This is especially
|
||||
* important on devices where the completion latencies are longer
|
||||
* than ~10 usec. We do use the stats for the relevant IO size
|
||||
* if available which does lead to better estimates.
|
||||
*/
|
||||
bucket = blk_mq_poll_stats_bkt(rq);
|
||||
if (bucket < 0)
|
||||
return ret;
|
||||
|
||||
if (q->poll_stat[bucket].nr_samples)
|
||||
ret = (q->poll_stat[bucket].mean + 1) / 2;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool blk_mq_poll_hybrid(struct request_queue *q, blk_qc_t qc)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, qc);
|
||||
struct request *rq = blk_qc_to_rq(hctx, qc);
|
||||
struct hrtimer_sleeper hs;
|
||||
enum hrtimer_mode mode;
|
||||
unsigned int nsecs;
|
||||
ktime_t kt;
|
||||
|
||||
/*
|
||||
* If a request has completed on queue that uses an I/O scheduler, we
|
||||
* won't get back a request from blk_qc_to_rq.
|
||||
*/
|
||||
if (!rq || (rq->rq_flags & RQF_MQ_POLL_SLEPT))
|
||||
return false;
|
||||
|
||||
/*
|
||||
* If we get here, hybrid polling is enabled. Hence poll_nsec can be:
|
||||
*
|
||||
* 0: use half of prev avg
|
||||
* >0: use this specific value
|
||||
*/
|
||||
if (q->poll_nsec > 0)
|
||||
nsecs = q->poll_nsec;
|
||||
else
|
||||
nsecs = blk_mq_poll_nsecs(q, rq);
|
||||
|
||||
if (!nsecs)
|
||||
return false;
|
||||
|
||||
rq->rq_flags |= RQF_MQ_POLL_SLEPT;
|
||||
|
||||
/*
|
||||
* This will be replaced with the stats tracking code, using
|
||||
* 'avg_completion_time / 2' as the pre-sleep target.
|
||||
*/
|
||||
kt = nsecs;
|
||||
|
||||
mode = HRTIMER_MODE_REL;
|
||||
hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
|
||||
hrtimer_set_expires(&hs.timer, kt);
|
||||
|
||||
do {
|
||||
if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
|
||||
break;
|
||||
set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
hrtimer_sleeper_start_expires(&hs, mode);
|
||||
if (hs.task)
|
||||
io_schedule();
|
||||
hrtimer_cancel(&hs.timer);
|
||||
mode = HRTIMER_MODE_ABS;
|
||||
} while (hs.task && !signal_pending(current));
|
||||
|
||||
__set_current_state(TASK_RUNNING);
|
||||
destroy_hrtimer_on_stack(&hs.timer);
|
||||
|
||||
/*
|
||||
* If we sleep, have the caller restart the poll loop to reset the
|
||||
* state. Like for the other success return cases, the caller is
|
||||
* responsible for checking if the IO completed. If the IO isn't
|
||||
* complete, we'll get called again and will go straight to the busy
|
||||
* poll loop.
|
||||
*/
|
||||
return true;
|
||||
}
|
||||
|
||||
static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
|
||||
struct io_comp_batch *iob, unsigned int flags)
|
||||
int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
|
||||
unsigned int flags)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie);
|
||||
long state = get_current_state();
|
||||
@ -4926,17 +4744,6 @@ static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
|
||||
unsigned int flags)
|
||||
{
|
||||
if (!(flags & BLK_POLL_NOSLEEP) &&
|
||||
q->poll_nsec != BLK_MQ_POLL_CLASSIC) {
|
||||
if (blk_mq_poll_hybrid(q, cookie))
|
||||
return 1;
|
||||
}
|
||||
return blk_mq_poll_classic(q, cookie, iob, flags);
|
||||
}
|
||||
|
||||
unsigned int blk_mq_rq_cpu(struct request *rq)
|
||||
{
|
||||
return rq->mq_ctx->cpu;
|
||||
|
@ -231,21 +231,3 @@ void blk_free_queue_stats(struct blk_queue_stats *stats)
|
||||
|
||||
kfree(stats);
|
||||
}
|
||||
|
||||
bool blk_stats_alloc_enable(struct request_queue *q)
|
||||
{
|
||||
struct blk_rq_stat *poll_stat;
|
||||
|
||||
poll_stat = kcalloc(BLK_MQ_POLL_STATS_BKTS, sizeof(*poll_stat),
|
||||
GFP_ATOMIC);
|
||||
if (!poll_stat)
|
||||
return false;
|
||||
|
||||
if (cmpxchg(&q->poll_stat, NULL, poll_stat) != NULL) {
|
||||
kfree(poll_stat);
|
||||
return true;
|
||||
}
|
||||
|
||||
blk_stat_add_callback(q, q->poll_cb);
|
||||
return false;
|
||||
}
|
||||
|
@ -408,35 +408,12 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
|
||||
|
||||
static ssize_t queue_poll_delay_show(struct request_queue *q, char *page)
|
||||
{
|
||||
int val;
|
||||
|
||||
if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
|
||||
val = BLK_MQ_POLL_CLASSIC;
|
||||
else
|
||||
val = q->poll_nsec / 1000;
|
||||
|
||||
return sprintf(page, "%d\n", val);
|
||||
return sprintf(page, "%d\n", -1);
|
||||
}
|
||||
|
||||
static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page,
|
||||
size_t count)
|
||||
{
|
||||
int err, val;
|
||||
|
||||
if (!q->mq_ops || !q->mq_ops->poll)
|
||||
return -EINVAL;
|
||||
|
||||
err = kstrtoint(page, 10, &val);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
if (val == BLK_MQ_POLL_CLASSIC)
|
||||
q->poll_nsec = BLK_MQ_POLL_CLASSIC;
|
||||
else if (val >= 0)
|
||||
q->poll_nsec = val * 1000;
|
||||
else
|
||||
return -EINVAL;
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
|
@ -57,8 +57,6 @@ typedef __u32 __bitwise req_flags_t;
|
||||
#define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18))
|
||||
/* The per-zone write lock is held for this request */
|
||||
#define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19))
|
||||
/* already slept for hybrid poll */
|
||||
#define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 20))
|
||||
/* ->timeout has been called, don't expire again */
|
||||
#define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21))
|
||||
/* queue has elevator attached */
|
||||
|
@ -44,12 +44,6 @@ extern const struct device_type disk_type;
|
||||
extern struct device_type part_type;
|
||||
extern struct class block_class;
|
||||
|
||||
/* Must be consistent with blk_mq_poll_stats_bkt() */
|
||||
#define BLK_MQ_POLL_STATS_BKTS 16
|
||||
|
||||
/* Doing classic polling */
|
||||
#define BLK_MQ_POLL_CLASSIC -1
|
||||
|
||||
/*
|
||||
* Maximum number of blkcg policies allowed to be registered concurrently.
|
||||
* Defined here to simplify include dependency.
|
||||
@ -468,10 +462,6 @@ struct request_queue {
|
||||
#endif
|
||||
|
||||
unsigned int rq_timeout;
|
||||
int poll_nsec;
|
||||
|
||||
struct blk_stat_callback *poll_cb;
|
||||
struct blk_rq_stat *poll_stat;
|
||||
|
||||
struct timer_list timeout;
|
||||
struct work_struct timeout_work;
|
||||
@ -870,8 +860,6 @@ blk_status_t errno_to_blk_status(int errno);
|
||||
|
||||
/* only poll the hardware once, don't continue until a completion was found */
|
||||
#define BLK_POLL_ONESHOT (1 << 0)
|
||||
/* do not sleep to wait for the expected completion time */
|
||||
#define BLK_POLL_NOSLEEP (1 << 1)
|
||||
int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags);
|
||||
int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
|
||||
unsigned int flags);
|
||||
|
@ -1002,7 +1002,7 @@ void io_rw_fail(struct io_kiocb *req)
|
||||
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
|
||||
{
|
||||
struct io_wq_work_node *pos, *start, *prev;
|
||||
unsigned int poll_flags = BLK_POLL_NOSLEEP;
|
||||
unsigned int poll_flags = 0;
|
||||
DEFINE_IO_COMP_BATCH(iob);
|
||||
int nr_events = 0;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user