forked from Minki/linux
b49773e7bc
Simultaneously writing to a sequential zone of a zoned block device from multiple contexts requires mutual exclusion for BIO issuing to ensure that writes happen sequentially. However, even for a well-behaved user correctly implementing such synchronization, BIO plugging may interfere and result in BIOs from the different contexts being reordered if plugging is done outside of the mutual exclusion section, e.g. the plug was started by a function higher in the call chain than the function issuing BIOs. Context A Context B | blk_start_plug() | ... | seq_write_zone() | mutex_lock(zone) | bio-0->bi_iter.bi_sector = zone->wp | zone->wp += bio_sectors(bio-0) | submit_bio(bio-0) | bio-1->bi_iter.bi_sector = zone->wp | zone->wp += bio_sectors(bio-1) | submit_bio(bio-1) | mutex_unlock(zone) | return | -----------------------> | seq_write_zone() | mutex_lock(zone) | bio-2->bi_iter.bi_sector = zone->wp | zone->wp += bio_sectors(bio-2) | submit_bio(bio-2) | mutex_unlock(zone) | <------------------------- | | blk_finish_plug() In the above example, despite the mutex synchronization ensuring the correct BIO issuing order 0, 1, 2, context A BIOs 0 and 1 end up being issued after BIO 2 of context B, when the plug is released with blk_finish_plug(). While this problem can be addressed using the blk_flush_plug_list() function (in the above example, the call must be inserted before the zone mutex lock is released), a simple generic solution in the block layer avoids this additional code in all zoned block device user code. The simple generic solution implemented with this patch is to introduce the internal helper function blk_mq_plug() to access the current context plug on BIO submission. This helper returns the current plug only if the target device is not a zoned block device or if the BIO to be plugged is not a write operation. Otherwise, the caller context plug is ignored and NULL is returned, resulting in writes to zoned block devices never being plugged.
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
1774 lines
48 KiB
C
1774 lines
48 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* Copyright (C) 1991, 1992 Linus Torvalds
|
|
* Copyright (C) 1994, Karl Keyte: Added support for disk statistics
|
|
* Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
|
|
* Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
|
|
* kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
|
|
* - July2000
|
|
* bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
|
|
*/
|
|
|
|
/*
|
|
* This handles all read/write requests to block devices
|
|
*/
|
|
#include <linux/kernel.h>
|
|
#include <linux/module.h>
|
|
#include <linux/backing-dev.h>
|
|
#include <linux/bio.h>
|
|
#include <linux/blkdev.h>
|
|
#include <linux/blk-mq.h>
|
|
#include <linux/highmem.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/kernel_stat.h>
|
|
#include <linux/string.h>
|
|
#include <linux/init.h>
|
|
#include <linux/completion.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/swap.h>
|
|
#include <linux/writeback.h>
|
|
#include <linux/task_io_accounting_ops.h>
|
|
#include <linux/fault-inject.h>
|
|
#include <linux/list_sort.h>
|
|
#include <linux/delay.h>
|
|
#include <linux/ratelimit.h>
|
|
#include <linux/pm_runtime.h>
|
|
#include <linux/blk-cgroup.h>
|
|
#include <linux/debugfs.h>
|
|
#include <linux/bpf.h>
|
|
|
|
#define CREATE_TRACE_POINTS
|
|
#include <trace/events/block.h>
|
|
|
|
#include "blk.h"
|
|
#include "blk-mq.h"
|
|
#include "blk-mq-sched.h"
|
|
#include "blk-pm.h"
|
|
#include "blk-rq-qos.h"
|
|
|
|
#ifdef CONFIG_DEBUG_FS
/* Only defined when debugfs support is configured. */
struct dentry *blk_debugfs_root;
#endif

/* Block tracepoints made available to GPL modules. */
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);

/* ID allocator from which blk_alloc_queue_node() takes each queue's ->id. */
DEFINE_IDA(blk_queue_ida);

/* Slab cache that request queues are allocated from and freed back to. */
struct kmem_cache *blk_requestq_cachep;

/* Controlling structure to kblockd. */
static struct workqueue_struct *kblockd_workqueue;
|
|
|
|
/**
|
|
* blk_queue_flag_set - atomically set a queue flag
|
|
* @flag: flag to be set
|
|
* @q: request queue
|
|
*/
|
|
void blk_queue_flag_set(unsigned int flag, struct request_queue *q)
|
|
{
|
|
set_bit(flag, &q->queue_flags);
|
|
}
|
|
EXPORT_SYMBOL(blk_queue_flag_set);
|
|
|
|
/**
|
|
* blk_queue_flag_clear - atomically clear a queue flag
|
|
* @flag: flag to be cleared
|
|
* @q: request queue
|
|
*/
|
|
void blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
|
|
{
|
|
clear_bit(flag, &q->queue_flags);
|
|
}
|
|
EXPORT_SYMBOL(blk_queue_flag_clear);
|
|
|
|
/**
|
|
* blk_queue_flag_test_and_set - atomically test and set a queue flag
|
|
* @flag: flag to be set
|
|
* @q: request queue
|
|
*
|
|
* Returns the previous value of @flag - 0 if the flag was not set and 1 if
|
|
* the flag was already set.
|
|
*/
|
|
bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q)
|
|
{
|
|
return test_and_set_bit(flag, &q->queue_flags);
|
|
}
|
|
EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set);
|
|
|
|
void blk_rq_init(struct request_queue *q, struct request *rq)
|
|
{
|
|
memset(rq, 0, sizeof(*rq));
|
|
|
|
INIT_LIST_HEAD(&rq->queuelist);
|
|
rq->q = q;
|
|
rq->__sector = (sector_t) -1;
|
|
INIT_HLIST_NODE(&rq->hash);
|
|
RB_CLEAR_NODE(&rq->rb_node);
|
|
rq->tag = -1;
|
|
rq->internal_tag = -1;
|
|
rq->start_time_ns = ktime_get_ns();
|
|
rq->part = NULL;
|
|
refcount_set(&rq->ref, 1);
|
|
}
|
|
EXPORT_SYMBOL(blk_rq_init);
|
|
|
|
#define REQ_OP_NAME(name) [REQ_OP_##name] = #name
|
|
static const char *const blk_op_name[] = {
|
|
REQ_OP_NAME(READ),
|
|
REQ_OP_NAME(WRITE),
|
|
REQ_OP_NAME(FLUSH),
|
|
REQ_OP_NAME(DISCARD),
|
|
REQ_OP_NAME(SECURE_ERASE),
|
|
REQ_OP_NAME(ZONE_RESET),
|
|
REQ_OP_NAME(WRITE_SAME),
|
|
REQ_OP_NAME(WRITE_ZEROES),
|
|
REQ_OP_NAME(SCSI_IN),
|
|
REQ_OP_NAME(SCSI_OUT),
|
|
REQ_OP_NAME(DRV_IN),
|
|
REQ_OP_NAME(DRV_OUT),
|
|
};
|
|
#undef REQ_OP_NAME
|
|
|
|
/**
|
|
* blk_op_str - Return string XXX in the REQ_OP_XXX.
|
|
* @op: REQ_OP_XXX.
|
|
*
|
|
* Description: Centralize block layer function to convert REQ_OP_XXX into
|
|
* string format. Useful in the debugging and tracing bio or request. For
|
|
* invalid REQ_OP_XXX it returns string "UNKNOWN".
|
|
*/
|
|
inline const char *blk_op_str(unsigned int op)
|
|
{
|
|
const char *op_str = "UNKNOWN";
|
|
|
|
if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op])
|
|
op_str = blk_op_name[op];
|
|
|
|
return op_str;
|
|
}
|
|
EXPORT_SYMBOL_GPL(blk_op_str);
|
|
|
|
static const struct {
|
|
int errno;
|
|
const char *name;
|
|
} blk_errors[] = {
|
|
[BLK_STS_OK] = { 0, "" },
|
|
[BLK_STS_NOTSUPP] = { -EOPNOTSUPP, "operation not supported" },
|
|
[BLK_STS_TIMEOUT] = { -ETIMEDOUT, "timeout" },
|
|
[BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" },
|
|
[BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" },
|
|
[BLK_STS_TARGET] = { -EREMOTEIO, "critical target" },
|
|
[BLK_STS_NEXUS] = { -EBADE, "critical nexus" },
|
|
[BLK_STS_MEDIUM] = { -ENODATA, "critical medium" },
|
|
[BLK_STS_PROTECTION] = { -EILSEQ, "protection" },
|
|
[BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" },
|
|
[BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" },
|
|
[BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" },
|
|
|
|
/* device mapper special case, should not leak out: */
|
|
[BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" },
|
|
|
|
/* everything else not covered above: */
|
|
[BLK_STS_IOERR] = { -EIO, "I/O" },
|
|
};
|
|
|
|
blk_status_t errno_to_blk_status(int errno)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < ARRAY_SIZE(blk_errors); i++) {
|
|
if (blk_errors[i].errno == errno)
|
|
return (__force blk_status_t)i;
|
|
}
|
|
|
|
return BLK_STS_IOERR;
|
|
}
|
|
EXPORT_SYMBOL_GPL(errno_to_blk_status);
|
|
|
|
/*
 * Map a block status code to its errno value via blk_errors[].
 *
 * A status that indexes past the end of the table triggers a one-time
 * warning and is reported as -EIO.
 */
int blk_status_to_errno(blk_status_t status)
{
	int idx = (__force int)status;

	return WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)) ?
		-EIO : blk_errors[idx].errno;
}
EXPORT_SYMBOL_GPL(blk_status_to_errno);
|
|
|
|
/*
 * Emit a rate-limited KERN_ERR line describing a failed request.
 *
 * @req:    request that failed
 * @status: block status; selects the description from blk_errors[]
 * @caller: prefix identifying who is reporting the error
 *
 * A status outside blk_errors[] triggers a one-time warning and nothing is
 * printed.
 */
static void print_req_error(struct request *req, blk_status_t status,
		const char *caller)
{
	int idx = (__force int)status;
	const char *disk_name;

	if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
		return;

	/* requests without an attached disk are printed as "?" */
	disk_name = req->rq_disk ? req->rq_disk->disk_name : "?";

	printk_ratelimited(KERN_ERR
		"%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
		"phys_seg %u prio class %u\n",
		caller, blk_errors[idx].name,
		disk_name,
		blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
		req->cmd_flags & ~REQ_OP_MASK,
		req->nr_phys_segments,
		IOPRIO_PRIO_CLASS(req->ioprio));
}
|
|
|
|
/*
 * Account @nbytes of completed I/O against @bio on behalf of @rq.
 *
 * A non-zero @error is recorded in the bio, the bio is marked quiet when the
 * request is, and the bio is advanced by @nbytes.  The bio is ended only
 * once it has no bytes left.
 */
static void req_bio_endio(struct request *rq, struct bio *bio,
			  unsigned int nbytes, blk_status_t error)
{
	if (error)
		bio->bi_status = error;

	if (unlikely(rq->rq_flags & RQF_QUIET))
		bio_set_flag(bio, BIO_QUIET);

	bio_advance(bio, nbytes);

	/* bytes still outstanding: the bio is not finished yet */
	if (bio->bi_iter.bi_size != 0)
		return;

	/* don't actually finish bio if it's part of flush sequence */
	if (rq->rq_flags & RQF_FLUSH_SEQ)
		return;

	bio_endio(bio);
}
|
|
|
|
void blk_dump_rq_flags(struct request *rq, char *msg)
|
|
{
|
|
printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
|
|
rq->rq_disk ? rq->rq_disk->disk_name : "?",
|
|
(unsigned long long) rq->cmd_flags);
|
|
|
|
printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",
|
|
(unsigned long long)blk_rq_pos(rq),
|
|
blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
|
|
printk(KERN_INFO " bio %p, biotail %p, len %u\n",
|
|
rq->bio, rq->biotail, blk_rq_bytes(rq));
|
|
}
|
|
EXPORT_SYMBOL(blk_dump_rq_flags);
|
|
|
|
/**
|
|
* blk_sync_queue - cancel any pending callbacks on a queue
|
|
* @q: the queue
|
|
*
|
|
* Description:
|
|
* The block layer may perform asynchronous callback activity
|
|
* on a queue, such as calling the unplug function after a timeout.
|
|
* A block device may call blk_sync_queue to ensure that any
|
|
* such activity is cancelled, thus allowing it to release resources
|
|
* that the callbacks might use. The caller must already have made sure
|
|
* that its ->make_request_fn will not re-add plugging prior to calling
|
|
* this function.
|
|
*
|
|
* This function does not cancel any asynchronous activity arising
|
|
* out of elevator or throttling code. That would require elevator_exit()
|
|
* and blkcg_exit_queue() to be called with queue lock initialized.
|
|
*
|
|
*/
|
|
void blk_sync_queue(struct request_queue *q)
|
|
{
|
|
del_timer_sync(&q->timeout);
|
|
cancel_work_sync(&q->timeout_work);
|
|
}
|
|
EXPORT_SYMBOL(blk_sync_queue);
|
|
|
|
/**
|
|
* blk_set_pm_only - increment pm_only counter
|
|
* @q: request queue pointer
|
|
*/
|
|
void blk_set_pm_only(struct request_queue *q)
|
|
{
|
|
atomic_inc(&q->pm_only);
|
|
}
|
|
EXPORT_SYMBOL_GPL(blk_set_pm_only);
|
|
|
|
void blk_clear_pm_only(struct request_queue *q)
|
|
{
|
|
int pm_only;
|
|
|
|
pm_only = atomic_dec_return(&q->pm_only);
|
|
WARN_ON_ONCE(pm_only < 0);
|
|
if (pm_only == 0)
|
|
wake_up_all(&q->mq_freeze_wq);
|
|
}
|
|
EXPORT_SYMBOL_GPL(blk_clear_pm_only);
|
|
|
|
void blk_put_queue(struct request_queue *q)
|
|
{
|
|
kobject_put(&q->kobj);
|
|
}
|
|
EXPORT_SYMBOL(blk_put_queue);
|
|
|
|
void blk_set_queue_dying(struct request_queue *q)
|
|
{
|
|
blk_queue_flag_set(QUEUE_FLAG_DYING, q);
|
|
|
|
/*
|
|
* When queue DYING flag is set, we need to block new req
|
|
* entering queue, so we call blk_freeze_queue_start() to
|
|
* prevent I/O from crossing blk_queue_enter().
|
|
*/
|
|
blk_freeze_queue_start(q);
|
|
|
|
if (queue_is_mq(q))
|
|
blk_mq_wake_waiters(q);
|
|
|
|
/* Make blk_queue_enter() reexamine the DYING flag. */
|
|
wake_up_all(&q->mq_freeze_wq);
|
|
}
|
|
EXPORT_SYMBOL_GPL(blk_set_queue_dying);
|
|
|
|
/**
|
|
* blk_cleanup_queue - shutdown a request queue
|
|
* @q: request queue to shutdown
|
|
*
|
|
* Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and
|
|
* put it. All future requests will be failed immediately with -ENODEV.
|
|
*/
|
|
void blk_cleanup_queue(struct request_queue *q)
|
|
{
|
|
/* mark @q DYING, no new request or merges will be allowed afterwards */
|
|
mutex_lock(&q->sysfs_lock);
|
|
blk_set_queue_dying(q);
|
|
|
|
blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
|
|
blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
|
|
blk_queue_flag_set(QUEUE_FLAG_DYING, q);
|
|
mutex_unlock(&q->sysfs_lock);
|
|
|
|
/*
|
|
* Drain all requests queued before DYING marking. Set DEAD flag to
|
|
* prevent that q->request_fn() gets invoked after draining finished.
|
|
*/
|
|
blk_freeze_queue(q);
|
|
|
|
rq_qos_exit(q);
|
|
|
|
blk_queue_flag_set(QUEUE_FLAG_DEAD, q);
|
|
|
|
/* for synchronous bio-based driver finish in-flight integrity i/o */
|
|
blk_flush_integrity();
|
|
|
|
/* @q won't process any more request, flush async actions */
|
|
del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer);
|
|
blk_sync_queue(q);
|
|
|
|
if (queue_is_mq(q))
|
|
blk_mq_exit_queue(q);
|
|
|
|
/*
|
|
* In theory, request pool of sched_tags belongs to request queue.
|
|
* However, the current implementation requires tag_set for freeing
|
|
* requests, so free the pool now.
|
|
*
|
|
* Queue has become frozen, there can't be any in-queue requests, so
|
|
* it is safe to free requests now.
|
|
*/
|
|
mutex_lock(&q->sysfs_lock);
|
|
if (q->elevator)
|
|
blk_mq_sched_free_requests(q);
|
|
mutex_unlock(&q->sysfs_lock);
|
|
|
|
percpu_ref_exit(&q->q_usage_counter);
|
|
|
|
/* @q is and will stay empty, shutdown and put */
|
|
blk_put_queue(q);
|
|
}
|
|
EXPORT_SYMBOL(blk_cleanup_queue);
|
|
|
|
/*
 * Allocate a request queue without a NUMA node preference; this is
 * blk_alloc_queue_node() called with NUMA_NO_NODE.
 *
 * Returns the new queue, or NULL on failure.
 */
struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
{
	return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE);
}
EXPORT_SYMBOL(blk_alloc_queue);
|
|
|
|
/**
|
|
* blk_queue_enter() - try to increase q->q_usage_counter
|
|
* @q: request queue pointer
|
|
* @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT
|
|
*/
|
|
int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
|
|
{
|
|
const bool pm = flags & BLK_MQ_REQ_PREEMPT;
|
|
|
|
while (true) {
|
|
bool success = false;
|
|
|
|
rcu_read_lock();
|
|
if (percpu_ref_tryget_live(&q->q_usage_counter)) {
|
|
/*
|
|
* The code that increments the pm_only counter is
|
|
* responsible for ensuring that that counter is
|
|
* globally visible before the queue is unfrozen.
|
|
*/
|
|
if (pm || !blk_queue_pm_only(q)) {
|
|
success = true;
|
|
} else {
|
|
percpu_ref_put(&q->q_usage_counter);
|
|
}
|
|
}
|
|
rcu_read_unlock();
|
|
|
|
if (success)
|
|
return 0;
|
|
|
|
if (flags & BLK_MQ_REQ_NOWAIT)
|
|
return -EBUSY;
|
|
|
|
/*
|
|
* read pair of barrier in blk_freeze_queue_start(),
|
|
* we need to order reading __PERCPU_REF_DEAD flag of
|
|
* .q_usage_counter and reading .mq_freeze_depth or
|
|
* queue dying flag, otherwise the following wait may
|
|
* never return if the two reads are reordered.
|
|
*/
|
|
smp_rmb();
|
|
|
|
wait_event(q->mq_freeze_wq,
|
|
(!q->mq_freeze_depth &&
|
|
(pm || (blk_pm_request_resume(q),
|
|
!blk_queue_pm_only(q)))) ||
|
|
blk_queue_dying(q));
|
|
if (blk_queue_dying(q))
|
|
return -ENODEV;
|
|
}
|
|
}
|
|
|
|
void blk_queue_exit(struct request_queue *q)
|
|
{
|
|
percpu_ref_put(&q->q_usage_counter);
|
|
}
|
|
|
|
static void blk_queue_usage_counter_release(struct percpu_ref *ref)
|
|
{
|
|
struct request_queue *q =
|
|
container_of(ref, struct request_queue, q_usage_counter);
|
|
|
|
wake_up_all(&q->mq_freeze_wq);
|
|
}
|
|
|
|
static void blk_rq_timed_out_timer(struct timer_list *t)
|
|
{
|
|
struct request_queue *q = from_timer(q, t, timeout);
|
|
|
|
kblockd_schedule_work(&q->timeout_work);
|
|
}
|
|
|
|
/*
 * Handler installed on q->timeout_work by blk_alloc_queue_node().
 * Intentionally does nothing.
 */
static void blk_timeout_work(struct work_struct *work)
{
}
|
|
|
|
/**
|
|
* blk_alloc_queue_node - allocate a request queue
|
|
* @gfp_mask: memory allocation flags
|
|
* @node_id: NUMA node to allocate memory from
|
|
*/
|
|
struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
|
|
{
|
|
struct request_queue *q;
|
|
int ret;
|
|
|
|
q = kmem_cache_alloc_node(blk_requestq_cachep,
|
|
gfp_mask | __GFP_ZERO, node_id);
|
|
if (!q)
|
|
return NULL;
|
|
|
|
INIT_LIST_HEAD(&q->queue_head);
|
|
q->last_merge = NULL;
|
|
|
|
q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
|
|
if (q->id < 0)
|
|
goto fail_q;
|
|
|
|
ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
|
|
if (ret)
|
|
goto fail_id;
|
|
|
|
q->backing_dev_info = bdi_alloc_node(gfp_mask, node_id);
|
|
if (!q->backing_dev_info)
|
|
goto fail_split;
|
|
|
|
q->stats = blk_alloc_queue_stats();
|
|
if (!q->stats)
|
|
goto fail_stats;
|
|
|
|
q->backing_dev_info->ra_pages = VM_READAHEAD_PAGES;
|
|
q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
|
|
q->backing_dev_info->name = "block";
|
|
q->node = node_id;
|
|
|
|
timer_setup(&q->backing_dev_info->laptop_mode_wb_timer,
|
|
laptop_mode_timer_fn, 0);
|
|
timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
|
|
INIT_WORK(&q->timeout_work, blk_timeout_work);
|
|
INIT_LIST_HEAD(&q->icq_list);
|
|
#ifdef CONFIG_BLK_CGROUP
|
|
INIT_LIST_HEAD(&q->blkg_list);
|
|
#endif
|
|
|
|
kobject_init(&q->kobj, &blk_queue_ktype);
|
|
|
|
#ifdef CONFIG_BLK_DEV_IO_TRACE
|
|
mutex_init(&q->blk_trace_mutex);
|
|
#endif
|
|
mutex_init(&q->sysfs_lock);
|
|
spin_lock_init(&q->queue_lock);
|
|
|
|
init_waitqueue_head(&q->mq_freeze_wq);
|
|
mutex_init(&q->mq_freeze_lock);
|
|
|
|
/*
|
|
* Init percpu_ref in atomic mode so that it's faster to shutdown.
|
|
* See blk_register_queue() for details.
|
|
*/
|
|
if (percpu_ref_init(&q->q_usage_counter,
|
|
blk_queue_usage_counter_release,
|
|
PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
|
|
goto fail_bdi;
|
|
|
|
if (blkcg_init_queue(q))
|
|
goto fail_ref;
|
|
|
|
return q;
|
|
|
|
fail_ref:
|
|
percpu_ref_exit(&q->q_usage_counter);
|
|
fail_bdi:
|
|
blk_free_queue_stats(q->stats);
|
|
fail_stats:
|
|
bdi_put(q->backing_dev_info);
|
|
fail_split:
|
|
bioset_exit(&q->bio_split);
|
|
fail_id:
|
|
ida_simple_remove(&blk_queue_ida, q->id);
|
|
fail_q:
|
|
kmem_cache_free(blk_requestq_cachep, q);
|
|
return NULL;
|
|
}
|
|
EXPORT_SYMBOL(blk_alloc_queue_node);
|
|
|
|
bool blk_get_queue(struct request_queue *q)
|
|
{
|
|
if (likely(!blk_queue_dying(q))) {
|
|
__blk_get_queue(q);
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
EXPORT_SYMBOL(blk_get_queue);
|
|
|
|
/**
|
|
* blk_get_request - allocate a request
|
|
* @q: request queue to allocate a request for
|
|
* @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC.
|
|
* @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT.
|
|
*/
|
|
struct request *blk_get_request(struct request_queue *q, unsigned int op,
|
|
blk_mq_req_flags_t flags)
|
|
{
|
|
struct request *req;
|
|
|
|
WARN_ON_ONCE(op & REQ_NOWAIT);
|
|
WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT));
|
|
|
|
req = blk_mq_alloc_request(q, op, flags);
|
|
if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
|
|
q->mq_ops->initialize_rq_fn(req);
|
|
|
|
return req;
|
|
}
|
|
EXPORT_SYMBOL(blk_get_request);
|
|
|
|
/* Release @req; counterpart of blk_get_request(). */
void blk_put_request(struct request *req)
{
	blk_mq_free_request(req);
}
EXPORT_SYMBOL(blk_put_request);
|
|
|
|
bool bio_attempt_back_merge(struct request *req, struct bio *bio,
|
|
unsigned int nr_segs)
|
|
{
|
|
const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
|
|
|
|
if (!ll_back_merge_fn(req, bio, nr_segs))
|
|
return false;
|
|
|
|
trace_block_bio_backmerge(req->q, req, bio);
|
|
|
|
if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
|
|
blk_rq_set_mixed_merge(req);
|
|
|
|
req->biotail->bi_next = bio;
|
|
req->biotail = bio;
|
|
req->__data_len += bio->bi_iter.bi_size;
|
|
|
|
blk_account_io_start(req, false);
|
|
return true;
|
|
}
|
|
|
|
bool bio_attempt_front_merge(struct request *req, struct bio *bio,
|
|
unsigned int nr_segs)
|
|
{
|
|
const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
|
|
|
|
if (!ll_front_merge_fn(req, bio, nr_segs))
|
|
return false;
|
|
|
|
trace_block_bio_frontmerge(req->q, req, bio);
|
|
|
|
if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
|
|
blk_rq_set_mixed_merge(req);
|
|
|
|
bio->bi_next = req->bio;
|
|
req->bio = bio;
|
|
|
|
req->__sector = bio->bi_iter.bi_sector;
|
|
req->__data_len += bio->bi_iter.bi_size;
|
|
|
|
blk_account_io_start(req, false);
|
|
return true;
|
|
}
|
|
|
|
bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
|
|
struct bio *bio)
|
|
{
|
|
unsigned short segments = blk_rq_nr_discard_segments(req);
|
|
|
|
if (segments >= queue_max_discard_segments(q))
|
|
goto no_merge;
|
|
if (blk_rq_sectors(req) + bio_sectors(bio) >
|
|
blk_rq_get_max_sectors(req, blk_rq_pos(req)))
|
|
goto no_merge;
|
|
|
|
req->biotail->bi_next = bio;
|
|
req->biotail = bio;
|
|
req->__data_len += bio->bi_iter.bi_size;
|
|
req->nr_phys_segments = segments + 1;
|
|
|
|
blk_account_io_start(req, false);
|
|
return true;
|
|
no_merge:
|
|
req_set_nomerge(q, req);
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* blk_attempt_plug_merge - try to merge with %current's plugged list
|
|
* @q: request_queue new bio is being queued at
|
|
* @bio: new bio being queued
|
|
* @nr_segs: number of segments in @bio
|
|
* @same_queue_rq: pointer to &struct request that gets filled in when
|
|
* another request associated with @q is found on the plug list
|
|
* (optional, may be %NULL)
|
|
*
|
|
* Determine whether @bio being queued on @q can be merged with a request
|
|
* on %current's plugged list. Returns %true if merge was successful,
|
|
* otherwise %false.
|
|
*
|
|
* Plugging coalesces IOs from the same issuer for the same purpose without
|
|
* going through @q->queue_lock. As such it's more of an issuing mechanism
|
|
* than scheduling, and the request, while may have elvpriv data, is not
|
|
* added on the elevator at this point. In addition, we don't have
|
|
* reliable access to the elevator outside queue lock. Only check basic
|
|
* merging parameters without querying the elevator.
|
|
*
|
|
* Caller must ensure !blk_queue_nomerges(q) beforehand.
|
|
*/
|
|
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
|
|
unsigned int nr_segs, struct request **same_queue_rq)
|
|
{
|
|
struct blk_plug *plug;
|
|
struct request *rq;
|
|
struct list_head *plug_list;
|
|
|
|
plug = blk_mq_plug(q, bio);
|
|
if (!plug)
|
|
return false;
|
|
|
|
plug_list = &plug->mq_list;
|
|
|
|
list_for_each_entry_reverse(rq, plug_list, queuelist) {
|
|
bool merged = false;
|
|
|
|
if (rq->q == q && same_queue_rq) {
|
|
/*
|
|
* Only blk-mq multiple hardware queues case checks the
|
|
* rq in the same queue, there should be only one such
|
|
* rq in a queue
|
|
**/
|
|
*same_queue_rq = rq;
|
|
}
|
|
|
|
if (rq->q != q || !blk_rq_merge_ok(rq, bio))
|
|
continue;
|
|
|
|
switch (blk_try_merge(rq, bio)) {
|
|
case ELEVATOR_BACK_MERGE:
|
|
merged = bio_attempt_back_merge(rq, bio, nr_segs);
|
|
break;
|
|
case ELEVATOR_FRONT_MERGE:
|
|
merged = bio_attempt_front_merge(rq, bio, nr_segs);
|
|
break;
|
|
case ELEVATOR_DISCARD_MERGE:
|
|
merged = bio_attempt_discard_merge(q, rq, bio);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
if (merged)
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
 * Log (KERN_INFO) that @bio reaches past the end of its device: the device
 * name, the bio's op/flag word, the end sector the bio wanted and the limit
 * @maxsector it was checked against.
 */
static void handle_bad_sector(struct bio *bio, sector_t maxsector)
{
	char devname[BDEVNAME_SIZE];

	printk(KERN_INFO "attempt to access beyond end of device\n");
	printk(KERN_INFO "%s: rw=%d, want=%Lu, limit=%Lu\n",
	       bio_devname(bio, devname), bio->bi_opf,
	       (unsigned long long)bio_end_sector(bio),
	       (long long)maxsector);
}
|
|
|
|
#ifdef CONFIG_FAIL_MAKE_REQUEST
|
|
|
|
static DECLARE_FAULT_ATTR(fail_make_request);
|
|
|
|
static int __init setup_fail_make_request(char *str)
|
|
{
|
|
return setup_fault_attr(&fail_make_request, str);
|
|
}
|
|
__setup("fail_make_request=", setup_fail_make_request);
|
|
|
|
static bool should_fail_request(struct hd_struct *part, unsigned int bytes)
|
|
{
|
|
return part->make_it_fail && should_fail(&fail_make_request, bytes);
|
|
}
|
|
|
|
static int __init fail_make_request_debugfs(void)
|
|
{
|
|
struct dentry *dir = fault_create_debugfs_attr("fail_make_request",
|
|
NULL, &fail_make_request);
|
|
|
|
return PTR_ERR_OR_ZERO(dir);
|
|
}
|
|
|
|
late_initcall(fail_make_request_debugfs);
|
|
|
|
#else /* CONFIG_FAIL_MAKE_REQUEST */
|
|
|
|
static inline bool should_fail_request(struct hd_struct *part,
|
|
unsigned int bytes)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
#endif /* CONFIG_FAIL_MAKE_REQUEST */
|
|
|
|
static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
|
|
{
|
|
const int op = bio_op(bio);
|
|
|
|
if (part->policy && op_is_write(op)) {
|
|
char b[BDEVNAME_SIZE];
|
|
|
|
if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
|
|
return false;
|
|
|
|
WARN_ONCE(1,
|
|
"generic_make_request: Trying to write "
|
|
"to read-only block-device %s (partno %d)\n",
|
|
bio_devname(bio, b), part->partno);
|
|
/* Older lvm-tools actually trigger this */
|
|
return false;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
static noinline int should_fail_bio(struct bio *bio)
|
|
{
|
|
if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
|
|
return -EIO;
|
|
return 0;
|
|
}
|
|
ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
|
|
|
|
/*
|
|
* Check whether this bio extends beyond the end of the device or partition.
|
|
* This may well happen - the kernel calls bread() without checking the size of
|
|
* the device, e.g., when mounting a file system.
|
|
*/
|
|
static inline int bio_check_eod(struct bio *bio, sector_t maxsector)
|
|
{
|
|
unsigned int nr_sectors = bio_sectors(bio);
|
|
|
|
if (nr_sectors && maxsector &&
|
|
(nr_sectors > maxsector ||
|
|
bio->bi_iter.bi_sector > maxsector - nr_sectors)) {
|
|
handle_bad_sector(bio, maxsector);
|
|
return -EIO;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Remap block n of partition p to block n+start(p) of the disk.
|
|
*/
|
|
static inline int blk_partition_remap(struct bio *bio)
|
|
{
|
|
struct hd_struct *p;
|
|
int ret = -EIO;
|
|
|
|
rcu_read_lock();
|
|
p = __disk_get_part(bio->bi_disk, bio->bi_partno);
|
|
if (unlikely(!p))
|
|
goto out;
|
|
if (unlikely(should_fail_request(p, bio->bi_iter.bi_size)))
|
|
goto out;
|
|
if (unlikely(bio_check_ro(bio, p)))
|
|
goto out;
|
|
|
|
/*
|
|
* Zone reset does not include bi_size so bio_sectors() is always 0.
|
|
* Include a test for the reset op code and perform the remap if needed.
|
|
*/
|
|
if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) {
|
|
if (bio_check_eod(bio, part_nr_sects_read(p)))
|
|
goto out;
|
|
bio->bi_iter.bi_sector += p->start_sect;
|
|
trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
|
|
bio->bi_iter.bi_sector - p->start_sect);
|
|
}
|
|
bio->bi_partno = 0;
|
|
ret = 0;
|
|
out:
|
|
rcu_read_unlock();
|
|
return ret;
|
|
}
|
|
|
|
static noinline_for_stack bool
|
|
generic_make_request_checks(struct bio *bio)
|
|
{
|
|
struct request_queue *q;
|
|
int nr_sectors = bio_sectors(bio);
|
|
blk_status_t status = BLK_STS_IOERR;
|
|
char b[BDEVNAME_SIZE];
|
|
|
|
might_sleep();
|
|
|
|
q = bio->bi_disk->queue;
|
|
if (unlikely(!q)) {
|
|
printk(KERN_ERR
|
|
"generic_make_request: Trying to access "
|
|
"nonexistent block-device %s (%Lu)\n",
|
|
bio_devname(bio, b), (long long)bio->bi_iter.bi_sector);
|
|
goto end_io;
|
|
}
|
|
|
|
/*
|
|
* For a REQ_NOWAIT based request, return -EOPNOTSUPP
|
|
* if queue is not a request based queue.
|
|
*/
|
|
if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_mq(q))
|
|
goto not_supported;
|
|
|
|
if (should_fail_bio(bio))
|
|
goto end_io;
|
|
|
|
if (bio->bi_partno) {
|
|
if (unlikely(blk_partition_remap(bio)))
|
|
goto end_io;
|
|
} else {
|
|
if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
|
|
goto end_io;
|
|
if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk))))
|
|
goto end_io;
|
|
}
|
|
|
|
/*
|
|
* Filter flush bio's early so that make_request based
|
|
* drivers without flush support don't have to worry
|
|
* about them.
|
|
*/
|
|
if (op_is_flush(bio->bi_opf) &&
|
|
!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
|
|
bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
|
|
if (!nr_sectors) {
|
|
status = BLK_STS_OK;
|
|
goto end_io;
|
|
}
|
|
}
|
|
|
|
if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
|
|
bio->bi_opf &= ~REQ_HIPRI;
|
|
|
|
switch (bio_op(bio)) {
|
|
case REQ_OP_DISCARD:
|
|
if (!blk_queue_discard(q))
|
|
goto not_supported;
|
|
break;
|
|
case REQ_OP_SECURE_ERASE:
|
|
if (!blk_queue_secure_erase(q))
|
|
goto not_supported;
|
|
break;
|
|
case REQ_OP_WRITE_SAME:
|
|
if (!q->limits.max_write_same_sectors)
|
|
goto not_supported;
|
|
break;
|
|
case REQ_OP_ZONE_RESET:
|
|
if (!blk_queue_is_zoned(q))
|
|
goto not_supported;
|
|
break;
|
|
case REQ_OP_WRITE_ZEROES:
|
|
if (!q->limits.max_write_zeroes_sectors)
|
|
goto not_supported;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
/*
|
|
* Various block parts want %current->io_context and lazy ioc
|
|
* allocation ends up trading a lot of pain for a small amount of
|
|
* memory. Just allocate it upfront. This may fail and block
|
|
* layer knows how to live with it.
|
|
*/
|
|
create_io_context(GFP_ATOMIC, q->node);
|
|
|
|
if (!blkcg_bio_issue_check(q, bio))
|
|
return false;
|
|
|
|
if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
|
|
trace_block_bio_queue(q, bio);
|
|
/* Now that enqueuing has been traced, we need to trace
|
|
* completion as well.
|
|
*/
|
|
bio_set_flag(bio, BIO_TRACE_COMPLETION);
|
|
}
|
|
return true;
|
|
|
|
not_supported:
|
|
status = BLK_STS_NOTSUPP;
|
|
end_io:
|
|
bio->bi_status = status;
|
|
bio_endio(bio);
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* generic_make_request - hand a buffer to its device driver for I/O
|
|
* @bio: The bio describing the location in memory and on the device.
|
|
*
|
|
* generic_make_request() is used to make I/O requests of block
|
|
* devices. It is passed a &struct bio, which describes the I/O that needs
|
|
* to be done.
|
|
*
|
|
* generic_make_request() does not return any status. The
|
|
* success/failure status of the request, along with notification of
|
|
* completion, is delivered asynchronously through the bio->bi_end_io
|
|
* function described (one day) else where.
|
|
*
|
|
* The caller of generic_make_request must make sure that bi_io_vec
|
|
* are set to describe the memory buffer, and that bi_dev and bi_sector are
|
|
* set to describe the device address, and the
|
|
* bi_end_io and optionally bi_private are set to describe how
|
|
* completion notification should be signaled.
|
|
*
|
|
* generic_make_request and the drivers it calls may use bi_next if this
|
|
* bio happens to be merged with someone else, and may resubmit the bio to
|
|
* a lower device by calling into generic_make_request recursively, which
|
|
* means the bio should NOT be touched after the call to ->make_request_fn.
|
|
*/
|
|
blk_qc_t generic_make_request(struct bio *bio)
|
|
{
|
|
/*
|
|
* bio_list_on_stack[0] contains bios submitted by the current
|
|
* make_request_fn.
|
|
* bio_list_on_stack[1] contains bios that were submitted before
|
|
* the current make_request_fn, but that haven't been processed
|
|
* yet.
|
|
*/
|
|
struct bio_list bio_list_on_stack[2];
|
|
blk_qc_t ret = BLK_QC_T_NONE;
|
|
|
|
if (!generic_make_request_checks(bio))
|
|
goto out;
|
|
|
|
/*
|
|
* We only want one ->make_request_fn to be active at a time, else
|
|
* stack usage with stacked devices could be a problem. So use
|
|
* current->bio_list to keep a list of requests submited by a
|
|
* make_request_fn function. current->bio_list is also used as a
|
|
* flag to say if generic_make_request is currently active in this
|
|
* task or not. If it is NULL, then no make_request is active. If
|
|
* it is non-NULL, then a make_request is active, and new requests
|
|
* should be added at the tail
|
|
*/
|
|
if (current->bio_list) {
|
|
bio_list_add(¤t->bio_list[0], bio);
|
|
goto out;
|
|
}
|
|
|
|
/* following loop may be a bit non-obvious, and so deserves some
|
|
* explanation.
|
|
* Before entering the loop, bio->bi_next is NULL (as all callers
|
|
* ensure that) so we have a list with a single bio.
|
|
* We pretend that we have just taken it off a longer list, so
|
|
* we assign bio_list to a pointer to the bio_list_on_stack,
|
|
* thus initialising the bio_list of new bios to be
|
|
* added. ->make_request() may indeed add some more bios
|
|
* through a recursive call to generic_make_request. If it
|
|
* did, we find a non-NULL value in bio_list and re-enter the loop
|
|
* from the top. In this case we really did just take the bio
|
|
* of the top of the list (no pretending) and so remove it from
|
|
* bio_list, and call into ->make_request() again.
|
|
*/
|
|
BUG_ON(bio->bi_next);
|
|
bio_list_init(&bio_list_on_stack[0]);
|
|
current->bio_list = bio_list_on_stack;
|
|
do {
|
|
struct request_queue *q = bio->bi_disk->queue;
|
|
blk_mq_req_flags_t flags = bio->bi_opf & REQ_NOWAIT ?
|
|
BLK_MQ_REQ_NOWAIT : 0;
|
|
|
|
if (likely(blk_queue_enter(q, flags) == 0)) {
|
|
struct bio_list lower, same;
|
|
|
|
/* Create a fresh bio_list for all subordinate requests */
|
|
bio_list_on_stack[1] = bio_list_on_stack[0];
|
|
bio_list_init(&bio_list_on_stack[0]);
|
|
ret = q->make_request_fn(q, bio);
|
|
|
|
blk_queue_exit(q);
|
|
|
|
/* sort new bios into those for a lower level
|
|
* and those for the same level
|
|
*/
|
|
bio_list_init(&lower);
|
|
bio_list_init(&same);
|
|
while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
|
|
if (q == bio->bi_disk->queue)
|
|
bio_list_add(&same, bio);
|
|
else
|
|
bio_list_add(&lower, bio);
|
|
/* now assemble so we handle the lowest level first */
|
|
bio_list_merge(&bio_list_on_stack[0], &lower);
|
|
bio_list_merge(&bio_list_on_stack[0], &same);
|
|
bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
|
|
} else {
|
|
if (unlikely(!blk_queue_dying(q) &&
|
|
(bio->bi_opf & REQ_NOWAIT)))
|
|
bio_wouldblock_error(bio);
|
|
else
|
|
bio_io_error(bio);
|
|
}
|
|
bio = bio_list_pop(&bio_list_on_stack[0]);
|
|
} while (bio);
|
|
current->bio_list = NULL; /* deactivate */
|
|
|
|
out:
|
|
return ret;
|
|
}
|
|
EXPORT_SYMBOL(generic_make_request);
|
|
|
|
/**
|
|
* direct_make_request - hand a buffer directly to its device driver for I/O
|
|
* @bio: The bio describing the location in memory and on the device.
|
|
*
|
|
* This function behaves like generic_make_request(), but does not protect
|
|
* against recursion. Must only be used if the called driver is known
|
|
* to not call generic_make_request (or direct_make_request) again from
|
|
* its make_request function. (Calling direct_make_request again from
|
|
* a workqueue is perfectly fine as that doesn't recurse).
|
|
*/
|
|
blk_qc_t direct_make_request(struct bio *bio)
|
|
{
|
|
struct request_queue *q = bio->bi_disk->queue;
|
|
bool nowait = bio->bi_opf & REQ_NOWAIT;
|
|
blk_qc_t ret;
|
|
|
|
if (!generic_make_request_checks(bio))
|
|
return BLK_QC_T_NONE;
|
|
|
|
if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) {
|
|
if (nowait && !blk_queue_dying(q))
|
|
bio->bi_status = BLK_STS_AGAIN;
|
|
else
|
|
bio->bi_status = BLK_STS_IOERR;
|
|
bio_endio(bio);
|
|
return BLK_QC_T_NONE;
|
|
}
|
|
|
|
ret = q->make_request_fn(q, bio);
|
|
blk_queue_exit(q);
|
|
return ret;
|
|
}
|
|
EXPORT_SYMBOL_GPL(direct_make_request);
|
|
|
|
/**
|
|
* submit_bio - submit a bio to the block device layer for I/O
|
|
* @bio: The &struct bio which describes the I/O
|
|
*
|
|
* submit_bio() is very similar in purpose to generic_make_request(), and
|
|
* uses that function to do most of the work. Both are fairly rough
|
|
* interfaces; @bio must be presetup and ready for I/O.
|
|
*
|
|
*/
|
|
blk_qc_t submit_bio(struct bio *bio)
|
|
{
|
|
if (blkcg_punt_bio_submit(bio))
|
|
return BLK_QC_T_NONE;
|
|
|
|
/*
|
|
* If it's a regular read/write or a barrier with data attached,
|
|
* go through the normal accounting stuff before submission.
|
|
*/
|
|
if (bio_has_data(bio)) {
|
|
unsigned int count;
|
|
|
|
if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
|
|
count = queue_logical_block_size(bio->bi_disk->queue) >> 9;
|
|
else
|
|
count = bio_sectors(bio);
|
|
|
|
if (op_is_write(bio_op(bio))) {
|
|
count_vm_events(PGPGOUT, count);
|
|
} else {
|
|
task_io_account_read(bio->bi_iter.bi_size);
|
|
count_vm_events(PGPGIN, count);
|
|
}
|
|
|
|
if (unlikely(block_dump)) {
|
|
char b[BDEVNAME_SIZE];
|
|
printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
|
|
current->comm, task_pid_nr(current),
|
|
op_is_write(bio_op(bio)) ? "WRITE" : "READ",
|
|
(unsigned long long)bio->bi_iter.bi_sector,
|
|
bio_devname(bio, b), count);
|
|
}
|
|
}
|
|
|
|
return generic_make_request(bio);
|
|
}
|
|
EXPORT_SYMBOL(submit_bio);
|
|
|
|
/**
|
|
* blk_cloned_rq_check_limits - Helper function to check a cloned request
|
|
* for new the queue limits
|
|
* @q: the queue
|
|
* @rq: the request being checked
|
|
*
|
|
* Description:
|
|
* @rq may have been made based on weaker limitations of upper-level queues
|
|
* in request stacking drivers, and it may violate the limitation of @q.
|
|
* Since the block layer and the underlying device driver trust @rq
|
|
* after it is inserted to @q, it should be checked against @q before
|
|
* the insertion using this generic function.
|
|
*
|
|
* Request stacking drivers like request-based dm may change the queue
|
|
* limits when retrying requests on other queues. Those requests need
|
|
* to be checked against the new queue limits again during dispatch.
|
|
*/
|
|
static int blk_cloned_rq_check_limits(struct request_queue *q,
|
|
struct request *rq)
|
|
{
|
|
if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, req_op(rq))) {
|
|
printk(KERN_ERR "%s: over max size limit. (%u > %u)\n",
|
|
__func__, blk_rq_sectors(rq),
|
|
blk_queue_get_max_sectors(q, req_op(rq)));
|
|
return -EIO;
|
|
}
|
|
|
|
/*
|
|
* queue's settings related to segment counting like q->bounce_pfn
|
|
* may differ from that of other stacking queues.
|
|
* Recalculate it to check the request correctly on this queue's
|
|
* limitation.
|
|
*/
|
|
rq->nr_phys_segments = blk_recalc_rq_segments(rq);
|
|
if (rq->nr_phys_segments > queue_max_segments(q)) {
|
|
printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n",
|
|
__func__, rq->nr_phys_segments, queue_max_segments(q));
|
|
return -EIO;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* blk_insert_cloned_request - Helper for stacking drivers to submit a request
|
|
* @q: the queue to submit the request
|
|
* @rq: the request being queued
|
|
*/
|
|
blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
|
|
{
|
|
if (blk_cloned_rq_check_limits(q, rq))
|
|
return BLK_STS_IOERR;
|
|
|
|
if (rq->rq_disk &&
|
|
should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
|
|
return BLK_STS_IOERR;
|
|
|
|
if (blk_queue_io_stat(q))
|
|
blk_account_io_start(rq, true);
|
|
|
|
/*
|
|
* Since we have a scheduler attached on the top device,
|
|
* bypass a potential scheduler on the bottom device for
|
|
* insert.
|
|
*/
|
|
return blk_mq_request_issue_directly(rq, true);
|
|
}
|
|
EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
|
|
|
|
/**
|
|
* blk_rq_err_bytes - determine number of bytes till the next failure boundary
|
|
* @rq: request to examine
|
|
*
|
|
* Description:
|
|
* A request could be merge of IOs which require different failure
|
|
* handling. This function determines the number of bytes which
|
|
* can be failed from the beginning of the request without
|
|
* crossing into area which need to be retried further.
|
|
*
|
|
* Return:
|
|
* The number of bytes to fail.
|
|
*/
|
|
unsigned int blk_rq_err_bytes(const struct request *rq)
|
|
{
|
|
unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
|
|
unsigned int bytes = 0;
|
|
struct bio *bio;
|
|
|
|
if (!(rq->rq_flags & RQF_MIXED_MERGE))
|
|
return blk_rq_bytes(rq);
|
|
|
|
/*
|
|
* Currently the only 'mixing' which can happen is between
|
|
* different fastfail types. We can safely fail portions
|
|
* which have all the failfast bits that the first one has -
|
|
* the ones which are at least as eager to fail as the first
|
|
* one.
|
|
*/
|
|
for (bio = rq->bio; bio; bio = bio->bi_next) {
|
|
if ((bio->bi_opf & ff) != ff)
|
|
break;
|
|
bytes += bio->bi_iter.bi_size;
|
|
}
|
|
|
|
/* this could lead to infinite loop */
|
|
BUG_ON(blk_rq_bytes(rq) && !bytes);
|
|
return bytes;
|
|
}
|
|
EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
|
|
|
|
void blk_account_io_completion(struct request *req, unsigned int bytes)
|
|
{
|
|
if (blk_do_io_stat(req)) {
|
|
const int sgrp = op_stat_group(req_op(req));
|
|
struct hd_struct *part;
|
|
|
|
part_stat_lock();
|
|
part = req->part;
|
|
part_stat_add(part, sectors[sgrp], bytes >> 9);
|
|
part_stat_unlock();
|
|
}
|
|
}
|
|
|
|
/*
 * Account the completion of @req at time @now (same unit as
 * req->start_time_ns).
 *
 * flush_rq isn't accounted as a normal IO on queueing nor on completion;
 * accounting the containing request is enough, so requests flagged
 * RQF_FLUSH_SEQ are skipped.
 */
void blk_account_io_done(struct request *req, u64 now)
{
	struct hd_struct *part;
	u64 duration;
	int sgrp;

	if (!blk_do_io_stat(req) || (req->rq_flags & RQF_FLUSH_SEQ))
		return;

	sgrp = op_stat_group(req_op(req));
	duration = now - req->start_time_ns;

	part_stat_lock();
	part = req->part;

	update_io_ticks(part, jiffies);
	part_stat_inc(part, ios[sgrp]);
	part_stat_add(part, nsecs[sgrp], duration);
	part_stat_add(part, time_in_queue, nsecs_to_jiffies64(duration));
	part_dec_in_flight(req->q, part, rq_data_dir(req));

	/* Drop the partition reference held by the request. */
	hd_struct_put(part);
	part_stat_unlock();
}
|
|
|
|
/*
 * Account the start of I/O for @rq.
 *
 * With @new_io set, the partition is looked up from the request's start
 * sector, referenced, counted as in flight and stored in rq->part.
 * Otherwise a merge is counted against the partition already stored in
 * rq->part.
 */
void blk_account_io_start(struct request *rq, bool new_io)
{
	const int rw = rq_data_dir(rq);
	struct hd_struct *part;

	if (!blk_do_io_stat(rq))
		return;

	part_stat_lock();

	if (new_io) {
		part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
		if (!hd_struct_try_get(part)) {
			/*
			 * The partition is already being removed, so the
			 * request is accounted on the disk only.
			 *
			 * A reference is taken on disk->part0 even though
			 * that partition will never be deleted, which lets
			 * it be treated like any other partition.
			 */
			part = &rq->rq_disk->part0;
			hd_struct_get(part);
		}
		part_inc_in_flight(rq->q, part, rw);
		rq->part = part;
	} else {
		part = rq->part;
		part_stat_inc(part, merges[rw]);
	}

	update_io_ticks(part, jiffies);

	part_stat_unlock();
}
|
|
|
|
/*
|
|
* Steal bios from a request and add them to a bio list.
|
|
* The request must not have been partially completed before.
|
|
*/
|
|
void blk_steal_bios(struct bio_list *list, struct request *rq)
|
|
{
|
|
if (rq->bio) {
|
|
if (list->tail)
|
|
list->tail->bi_next = rq->bio;
|
|
else
|
|
list->head = rq->bio;
|
|
list->tail = rq->biotail;
|
|
|
|
rq->bio = NULL;
|
|
rq->biotail = NULL;
|
|
}
|
|
|
|
rq->__data_len = 0;
|
|
}
|
|
EXPORT_SYMBOL_GPL(blk_steal_bios);
|
|
|
|
/**
|
|
* blk_update_request - Special helper function for request stacking drivers
|
|
* @req: the request being processed
|
|
* @error: block status code
|
|
* @nr_bytes: number of bytes to complete @req
|
|
*
|
|
* Description:
|
|
* Ends I/O on a number of bytes attached to @req, but doesn't complete
|
|
* the request structure even if @req doesn't have leftover.
|
|
* If @req has leftover, sets it up for the next range of segments.
|
|
*
|
|
* This special helper function is only for request stacking drivers
|
|
* (e.g. request-based dm) so that they can handle partial completion.
|
|
* Actual device drivers should use blk_mq_end_request instead.
|
|
*
|
|
* Passing the result of blk_rq_bytes() as @nr_bytes guarantees
|
|
* %false return from this function.
|
|
*
|
|
* Note:
|
|
* The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both
|
|
* blk_rq_bytes() and in blk_update_request().
|
|
*
|
|
* Return:
|
|
* %false - this request doesn't have any more data
|
|
* %true - this request has more data
|
|
**/
|
|
bool blk_update_request(struct request *req, blk_status_t error,
|
|
unsigned int nr_bytes)
|
|
{
|
|
int total_bytes;
|
|
|
|
trace_block_rq_complete(req, blk_status_to_errno(error), nr_bytes);
|
|
|
|
if (!req->bio)
|
|
return false;
|
|
|
|
if (unlikely(error && !blk_rq_is_passthrough(req) &&
|
|
!(req->rq_flags & RQF_QUIET)))
|
|
print_req_error(req, error, __func__);
|
|
|
|
blk_account_io_completion(req, nr_bytes);
|
|
|
|
total_bytes = 0;
|
|
while (req->bio) {
|
|
struct bio *bio = req->bio;
|
|
unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
|
|
|
|
if (bio_bytes == bio->bi_iter.bi_size)
|
|
req->bio = bio->bi_next;
|
|
|
|
/* Completion has already been traced */
|
|
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
|
|
req_bio_endio(req, bio, bio_bytes, error);
|
|
|
|
total_bytes += bio_bytes;
|
|
nr_bytes -= bio_bytes;
|
|
|
|
if (!nr_bytes)
|
|
break;
|
|
}
|
|
|
|
/*
|
|
* completely done
|
|
*/
|
|
if (!req->bio) {
|
|
/*
|
|
* Reset counters so that the request stacking driver
|
|
* can find how many bytes remain in the request
|
|
* later.
|
|
*/
|
|
req->__data_len = 0;
|
|
return false;
|
|
}
|
|
|
|
req->__data_len -= total_bytes;
|
|
|
|
/* update sector only for requests with clear definition of sector */
|
|
if (!blk_rq_is_passthrough(req))
|
|
req->__sector += total_bytes >> 9;
|
|
|
|
/* mixed attributes always follow the first bio */
|
|
if (req->rq_flags & RQF_MIXED_MERGE) {
|
|
req->cmd_flags &= ~REQ_FAILFAST_MASK;
|
|
req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
|
|
}
|
|
|
|
if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
|
|
/*
|
|
* If total number of sectors is less than the first segment
|
|
* size, something has gone terribly wrong.
|
|
*/
|
|
if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
|
|
blk_dump_rq_flags(req, "request botched");
|
|
req->__data_len = blk_rq_cur_bytes(req);
|
|
}
|
|
|
|
/* recalculate the number of segments */
|
|
req->nr_phys_segments = blk_recalc_rq_segments(req);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
EXPORT_SYMBOL_GPL(blk_update_request);
|
|
|
|
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
/**
 * rq_flush_dcache_pages - Helper function to flush all pages in a request
 * @rq: the request to be flushed
 *
 * Description:
 *     Calls flush_dcache_page() on the page of every segment in @rq.
 */
void rq_flush_dcache_pages(struct request *rq)
{
	struct req_iterator it;
	struct bio_vec seg;

	rq_for_each_segment(seg, rq, it) {
		flush_dcache_page(seg.bv_page);
	}
}
EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
#endif
|
|
|
|
/**
|
|
* blk_lld_busy - Check if underlying low-level drivers of a device are busy
|
|
* @q : the queue of the device being checked
|
|
*
|
|
* Description:
|
|
* Check if underlying low-level drivers of a device are busy.
|
|
* If the drivers want to export their busy state, they must set own
|
|
* exporting function using blk_queue_lld_busy() first.
|
|
*
|
|
* Basically, this function is used only by request stacking drivers
|
|
* to stop dispatching requests to underlying devices when underlying
|
|
* devices are busy. This behavior helps more I/O merging on the queue
|
|
* of the request stacking driver and prevents I/O throughput regression
|
|
* on burst I/O load.
|
|
*
|
|
* Return:
|
|
* 0 - Not busy (The request stacking driver should dispatch request)
|
|
* 1 - Busy (The request stacking driver should stop dispatching request)
|
|
*/
|
|
int blk_lld_busy(struct request_queue *q)
|
|
{
|
|
if (queue_is_mq(q) && q->mq_ops->busy)
|
|
return q->mq_ops->busy(q);
|
|
|
|
return 0;
|
|
}
|
|
EXPORT_SYMBOL_GPL(blk_lld_busy);
|
|
|
|
/**
|
|
* blk_rq_unprep_clone - Helper function to free all bios in a cloned request
|
|
* @rq: the clone request to be cleaned up
|
|
*
|
|
* Description:
|
|
* Free all bios in @rq for a cloned request.
|
|
*/
|
|
void blk_rq_unprep_clone(struct request *rq)
|
|
{
|
|
struct bio *bio;
|
|
|
|
while ((bio = rq->bio) != NULL) {
|
|
rq->bio = bio->bi_next;
|
|
|
|
bio_put(bio);
|
|
}
|
|
}
|
|
EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
|
|
|
|
/*
|
|
* Copy attributes of the original request to the clone request.
|
|
* The actual data parts (e.g. ->cmd, ->sense) are not copied.
|
|
*/
|
|
static void __blk_rq_prep_clone(struct request *dst, struct request *src)
|
|
{
|
|
dst->__sector = blk_rq_pos(src);
|
|
dst->__data_len = blk_rq_bytes(src);
|
|
if (src->rq_flags & RQF_SPECIAL_PAYLOAD) {
|
|
dst->rq_flags |= RQF_SPECIAL_PAYLOAD;
|
|
dst->special_vec = src->special_vec;
|
|
}
|
|
dst->nr_phys_segments = src->nr_phys_segments;
|
|
dst->ioprio = src->ioprio;
|
|
dst->extra_len = src->extra_len;
|
|
}
|
|
|
|
/**
|
|
* blk_rq_prep_clone - Helper function to setup clone request
|
|
* @rq: the request to be setup
|
|
* @rq_src: original request to be cloned
|
|
* @bs: bio_set that bios for clone are allocated from
|
|
* @gfp_mask: memory allocation mask for bio
|
|
* @bio_ctr: setup function to be called for each clone bio.
|
|
* Returns %0 for success, non %0 for failure.
|
|
* @data: private data to be passed to @bio_ctr
|
|
*
|
|
* Description:
|
|
* Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
|
|
* The actual data parts of @rq_src (e.g. ->cmd, ->sense)
|
|
* are not copied, and copying such parts is the caller's responsibility.
|
|
* Also, pages which the original bios are pointing to are not copied
|
|
* and the cloned bios just point same pages.
|
|
* So cloned bios must be completed before original bios, which means
|
|
* the caller must complete @rq before @rq_src.
|
|
*/
|
|
int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
|
|
struct bio_set *bs, gfp_t gfp_mask,
|
|
int (*bio_ctr)(struct bio *, struct bio *, void *),
|
|
void *data)
|
|
{
|
|
struct bio *bio, *bio_src;
|
|
|
|
if (!bs)
|
|
bs = &fs_bio_set;
|
|
|
|
__rq_for_each_bio(bio_src, rq_src) {
|
|
bio = bio_clone_fast(bio_src, gfp_mask, bs);
|
|
if (!bio)
|
|
goto free_and_out;
|
|
|
|
if (bio_ctr && bio_ctr(bio, bio_src, data))
|
|
goto free_and_out;
|
|
|
|
if (rq->bio) {
|
|
rq->biotail->bi_next = bio;
|
|
rq->biotail = bio;
|
|
} else
|
|
rq->bio = rq->biotail = bio;
|
|
}
|
|
|
|
__blk_rq_prep_clone(rq, rq_src);
|
|
|
|
return 0;
|
|
|
|
free_and_out:
|
|
if (bio)
|
|
bio_put(bio);
|
|
blk_rq_unprep_clone(rq);
|
|
|
|
return -ENOMEM;
|
|
}
|
|
EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
|
|
|
|
int kblockd_schedule_work(struct work_struct *work)
|
|
{
|
|
return queue_work(kblockd_workqueue, work);
|
|
}
|
|
EXPORT_SYMBOL(kblockd_schedule_work);
|
|
|
|
int kblockd_schedule_work_on(int cpu, struct work_struct *work)
|
|
{
|
|
return queue_work_on(cpu, kblockd_workqueue, work);
|
|
}
|
|
EXPORT_SYMBOL(kblockd_schedule_work_on);
|
|
|
|
int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
|
|
unsigned long delay)
|
|
{
|
|
return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
|
|
}
|
|
EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
|
|
|
|
/**
|
|
* blk_start_plug - initialize blk_plug and track it inside the task_struct
|
|
* @plug: The &struct blk_plug that needs to be initialized
|
|
*
|
|
* Description:
|
|
* blk_start_plug() indicates to the block layer an intent by the caller
|
|
* to submit multiple I/O requests in a batch. The block layer may use
|
|
* this hint to defer submitting I/Os from the caller until blk_finish_plug()
|
|
* is called. However, the block layer may choose to submit requests
|
|
* before a call to blk_finish_plug() if the number of queued I/Os
|
|
* exceeds %BLK_MAX_REQUEST_COUNT, or if the size of the I/O is larger than
|
|
* %BLK_PLUG_FLUSH_SIZE. The queued I/Os may also be submitted early if
|
|
* the task schedules (see below).
|
|
*
|
|
* Tracking blk_plug inside the task_struct will help with auto-flushing the
|
|
* pending I/O should the task end up blocking between blk_start_plug() and
|
|
* blk_finish_plug(). This is important from a performance perspective, but
|
|
* also ensures that we don't deadlock. For instance, if the task is blocking
|
|
* for a memory allocation, memory reclaim could end up wanting to free a
|
|
* page belonging to that request that is currently residing in our private
|
|
* plug. By flushing the pending I/O when the process goes to sleep, we avoid
|
|
* this kind of deadlock.
|
|
*/
|
|
void blk_start_plug(struct blk_plug *plug)
|
|
{
|
|
struct task_struct *tsk = current;
|
|
|
|
/*
|
|
* If this is a nested plug, don't actually assign it.
|
|
*/
|
|
if (tsk->plug)
|
|
return;
|
|
|
|
INIT_LIST_HEAD(&plug->mq_list);
|
|
INIT_LIST_HEAD(&plug->cb_list);
|
|
plug->rq_count = 0;
|
|
plug->multiple_queues = false;
|
|
|
|
/*
|
|
* Store ordering should not be needed here, since a potential
|
|
* preempt will imply a full memory barrier
|
|
*/
|
|
tsk->plug = plug;
|
|
}
|
|
EXPORT_SYMBOL(blk_start_plug);
|
|
|
|
/*
 * Run every callback registered on @plug, passing @from_schedule through.
 *
 * The callbacks are moved to a private list in batches and each entry is
 * unlinked before it is invoked.  plug->cb_list is re-checked after each
 * batch, so entries that appear on it in the meantime are run as well.
 */
static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
{
	LIST_HEAD(pending);
	struct blk_plug_cb *cb;

	while (!list_empty(&plug->cb_list)) {
		list_splice_init(&plug->cb_list, &pending);

		while (!list_empty(&pending)) {
			cb = list_first_entry(&pending, struct blk_plug_cb,
					      list);
			list_del(&cb->list);
			cb->callback(cb, from_schedule);
		}
	}
}
|
|
|
|
/*
 * Find or create the plug callback (@unplug, @data) on the current task's
 * plug.
 *
 * Returns NULL when the task has no plug or the allocation fails.  An
 * existing entry with the same callback and data is returned as is;
 * otherwise a zeroed entry of @size bytes (which must be at least
 * sizeof(struct blk_plug_cb)) is allocated with GFP_ATOMIC, filled in and
 * added to the plug's callback list.
 */
struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data,
				      int size)
{
	struct blk_plug *plug = current->plug;
	struct blk_plug_cb *cb;

	if (!plug)
		return NULL;

	list_for_each_entry(cb, &plug->cb_list, list) {
		if (cb->callback == unplug && cb->data == data)
			return cb;
	}

	/* Not currently on the callback list */
	BUG_ON(size < sizeof(*cb));
	cb = kzalloc(size, GFP_ATOMIC);
	if (!cb)
		return NULL;

	cb->data = data;
	cb->callback = unplug;
	list_add(&cb->list, &plug->cb_list);
	return cb;
}
EXPORT_SYMBOL(blk_check_plugged);
|
|
|
|
/*
 * Flush @plug: run its registered callbacks first, then hand any requests
 * on plug->mq_list to blk_mq_flush_plug_list().  @from_schedule is passed
 * through to both.
 */
void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
	flush_plug_callbacks(plug, from_schedule);

	if (list_empty(&plug->mq_list))
		return;

	blk_mq_flush_plug_list(plug, from_schedule);
}
|
|
|
|
/**
|
|
* blk_finish_plug - mark the end of a batch of submitted I/O
|
|
* @plug: The &struct blk_plug passed to blk_start_plug()
|
|
*
|
|
* Description:
|
|
* Indicate that a batch of I/O submissions is complete. This function
|
|
* must be paired with an initial call to blk_start_plug(). The intent
|
|
* is to allow the block layer to optimize I/O submission. See the
|
|
* documentation for blk_start_plug() for more information.
|
|
*/
|
|
void blk_finish_plug(struct blk_plug *plug)
|
|
{
|
|
if (plug != current->plug)
|
|
return;
|
|
blk_flush_plug_list(plug, false);
|
|
|
|
current->plug = NULL;
|
|
}
|
|
EXPORT_SYMBOL(blk_finish_plug);
|
|
|
|
/*
 * Block layer initialisation: build-time checks on the op/flag bit
 * budget, creation of the kblockd workqueue and the request_queue slab
 * cache, and (with CONFIG_DEBUG_FS) the "block" debugfs directory.
 * Panics if the kblockd workqueue cannot be created; returns 0.
 */
int __init blk_dev_init(void)
{
	/* used for unplugging and affects IO latency/throughput - HIGHPRI */
	const unsigned int kblockd_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI;

	/* The op and flag bits must fit into both cmd_flags and bi_opf. */
	BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
			FIELD_SIZEOF(struct bio, bi_opf));
	BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
			FIELD_SIZEOF(struct request, cmd_flags));
	BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS));

	kblockd_workqueue = alloc_workqueue("kblockd", kblockd_flags, 0);
	if (!kblockd_workqueue)
		panic("Failed to create kblockd\n");

	blk_requestq_cachep = kmem_cache_create("request_queue",
						sizeof(struct request_queue),
						0, SLAB_PANIC, NULL);

#ifdef CONFIG_DEBUG_FS
	blk_debugfs_root = debugfs_create_dir("block", NULL);
#endif

	return 0;
}
|