Mirror of https://github.com/torvalds/linux.git (synced 2024-12-29 06:12:08 +00:00)
for-5.1/block-20190302
-----BEGIN PGP SIGNATURE-----

iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAlx63XIQHGF4Ym9lQGtl
cm5lbC5kawAKCRD301j7KXHgpp2vEACfrrQsap7R+Av28mmXpmXi2FPa3g5Tev1t
yYjK2qHvhlMZjPTYw3hCmbYdDDczlF7PEgSE2x2DjdcsYapb8Fy1lZ2X16c7ztBR
HD/t9b5AVSQsczZzKgv3RqsNtTnjzS5V0A8XH8FAP2QRgiwDMwSN6G0FP0JBLbE/
ZgxQrH1Iy1F33Wz4hI3Z7dEghKPZrH1IlegkZCEu47q9SlWS76qUetSy2GEtchOl
3Lgu54mQZyVdI5/QZf9DyMDLF6dIz3tYU2qhuo01AHjGRCC72v86p8sIiXcUr94Q
8pbegJhJ/g8KBol9Qhv3+pWG/QUAZwi/ZwasTkK+MJ4klRXfOrznxPubW1z6t9Vn
QRo39Po5SqqP0QWAscDxCFjESIQlWlKa+LZurJL7DJDCUGrSgzTpnVwFqKwc5zTP
HJa5MT2tEeL2TfUYRYCfh0ZV0elINdHA1y1klDBh38drh4EWr2gW8xdseGYXqRjh
fLgEpoF7VQ8kTvxKN+E4jZXkcZmoLmefp0ZyAbblS6IawpPVC7kXM9Fdn2OU8f2c
fjVjvSiqxfeN6dnpfeLDRbbN9894HwgP/LPropJOQ7KmjCorQq5zMDkAvoh3tElq
qwluRqdBJpWT/F05KweY+XVW8OawIycmUWqt6JrVNoIDAK31auHQv47kR0VA4OvE
DRVVhYpocw==
=VBaU
-----END PGP SIGNATURE-----

Merge tag 'for-5.1/block-20190302' of git://git.kernel.dk/linux-block

Pull block layer updates from Jens Axboe:
 "Not a huge amount of changes in this round, the biggest one is that
  we finally have Ming's multi-page bvec support merged. Apart from
  that, this pull request contains:

   - Small series that avoids quiescing the queue for sysfs changes
     that match what we currently have (Aleksei)

   - Series of bcache fixes (via Coly)

   - Series of lightnvm fixes (via Mathias)

   - NVMe pull request from Christoph. Nothing major, just SPDX/license
     cleanups, RR mp policy (Hannes), and little fixes (Bart, Chaitanya).

   - BFQ series (Paolo)

   - Save blk-mq cpu -> hw queue mapping, removing a pointer indirection
     for the fast path (Jianchao)

   - fops->iopoll() added for async IO polling, this is a feature that
     the upcoming io_uring interface will use (Christoph, me)

   - Partition scan loop fixes (Dongli)

   - mtip32xx conversion from managed resource API (Christoph)

   - cdrom registration race fix (Guenter)

   - MD pull from Song, two minor fixes.

   - Various documentation fixes (Marcos)

   - Multi-page bvec feature. This brings a lot of nice improvements
     with it, like more efficient splitting, larger IOs can be supported
     without growing the bvec table size, and so on. (Ming)

   - Various little fixes to core and drivers"

* tag 'for-5.1/block-20190302' of git://git.kernel.dk/linux-block: (117 commits)
  block: fix updating bio's front segment size
  block: Replace function name in string with __func__
  nbd: propagate genlmsg_reply return code
  floppy: remove set but not used variable 'q'
  null_blk: fix checking for REQ_FUA
  block: fix NULL pointer dereference in register_disk
  fs: fix guard_bio_eod to check for real EOD errors
  blk-mq: use HCTX_TYPE_DEFAULT but not 0 to index blk_mq_tag_set->map
  block: optimize bvec iteration in bvec_iter_advance
  block: introduce mp_bvec_for_each_page() for iterating over page
  block: optimize blk_bio_segment_split for single-page bvec
  block: optimize __blk_segment_map_sg() for single-page bvec
  block: introduce bvec_nth_page()
  iomap: wire up the iopoll method
  block: add bio_set_polled() helper
  block: wire up block device iopoll method
  fs: add an iopoll method to struct file_operations
  loop: set GENHD_FL_NO_PART_SCAN after blkdev_reread_part()
  loop: do not print warn message if partition scan is successful
  block: bounce: make sure that bvec table is updated
  ...
This commit is contained in: commit 80201fe175
Documentation/block/biovecs.txt
@@ -117,3 +117,28 @@ Other implications:
size limitations and the limitations of the underlying devices. Thus
there's no need to define ->merge_bvec_fn() callbacks for individual block
drivers.

Usage of helpers:
=================

* The following helpers whose names have the suffix of "_all" can only be used
on non-BIO_CLONED bio. They are usually used by filesystem code. Drivers
shouldn't use them because the bio may have been split before it reached the
driver.

	bio_for_each_segment_all()
	bio_first_bvec_all()
	bio_first_page_all()
	bio_last_bvec_all()

* The following helpers iterate over single-page segment. The passed 'struct
bio_vec' will contain a single-page IO vector during the iteration

	bio_for_each_segment()
	bio_for_each_segment_all()

* The following helpers iterate over multi-page bvec. The passed 'struct
bio_vec' will contain a multi-page IO vector during the iteration

	bio_for_each_bvec()
	rq_for_each_bvec()
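To make the distinction between the two iteration styles concrete, here is a small illustrative fragment. It is not part of this patch set; the helper name count_bio_data is invented, and the sketch only compiles in a kernel tree that already has the multi-page bvec helpers from this series.

#include <linux/bio.h>

/*
 * Illustrative only: walk one bio twice, once per multi-page bvec and
 * once per single-page segment, and check that both views cover the
 * same number of bytes.
 */
static unsigned int count_bio_data(struct bio *bio)
{
	struct bio_vec bv;
	struct bvec_iter iter;
	unsigned int bytes_mp = 0, bytes_sp = 0;

	/* Multi-page view: each bv may span several contiguous pages. */
	bio_for_each_bvec(bv, bio, iter)
		bytes_mp += bv.bv_len;

	/* Single-page view: each bv is clamped to a single page. */
	bio_for_each_segment(bv, bio, iter)
		bytes_sp += bv.bv_len;

	WARN_ON_ONCE(bytes_mp != bytes_sp);
	return bytes_mp;
}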
Documentation/filesystems/vfs.txt
@@ -857,6 +857,7 @@ struct file_operations {
	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
	int (*iopoll)(struct kiocb *kiocb, bool spin);
	int (*iterate) (struct file *, struct dir_context *);
	int (*iterate_shared) (struct file *, struct dir_context *);
	__poll_t (*poll) (struct file *, struct poll_table_struct *);

@@ -902,6 +903,8 @@ otherwise noted.

  write_iter: possibly asynchronous write with iov_iter as source

  iopoll: called when aio wants to poll for completions on HIPRI iocbs

  iterate: called when the VFS needs to read the directory contents

  iterate_shared: called when the VFS needs to read the directory contents
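For orientation, a hedged sketch of what an ->iopoll instance looks like in this series follows. It approximates the block-device wiring added by these patches (fs/block_dev.c hooks a function of this shape into def_blk_fops as .iopoll); the function name carries a _sketch suffix to mark it as an illustration rather than a verbatim copy.

#include <linux/fs.h>
#include <linux/blkdev.h>

/*
 * Poll the hardware queue that the HIPRI iocb was submitted on,
 * identified by the cookie stored in kiocb->ki_cookie at submission
 * time. Returns > 0 if completions were found, 0 otherwise.
 */
static int blkdev_iopoll_sketch(struct kiocb *kiocb, bool spin)
{
	struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
	struct request_queue *q = bdev_get_queue(bdev);

	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
}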
block/bfq-iosched.c
@@ -230,11 +230,16 @@ static struct kmem_cache *bfq_pool;
#define BFQ_MIN_TT		(2 * NSEC_PER_MSEC)

/* hw_tag detection: parallel requests threshold and min samples needed. */
#define BFQ_HW_QUEUE_THRESHOLD	4
#define BFQ_HW_QUEUE_THRESHOLD	3
#define BFQ_HW_QUEUE_SAMPLES	32

#define BFQQ_SEEK_THR		(sector_t)(8 * 100)
#define BFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
#define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \
	(get_sdist(last_pos, rq) >		\
	 BFQQ_SEEK_THR &&			\
	 (!blk_queue_nonrot(bfqd->queue) ||	\
	  blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT))
#define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)
#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 19)
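As a side note, the seek-detection machinery above can be modelled outside the kernel. The standalone sketch below is plain C, not kernel code: __builtin_popcount stands in for hweight32(), and all names are invented. It shows how the 32-bit seek_history window drives the BFQQ_SEEKY test.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* One bit per observed request, shifted in at the bottom; a queue is
 * "seeky" when more than 19 of the last 32 requests were seeky. */
static uint32_t seek_history;

static void record_request(bool request_was_seeky)
{
	seek_history = (seek_history << 1) | request_was_seeky;
}

static bool queue_is_seeky(void)
{
	return __builtin_popcount(seek_history) > 19;
}

int main(void)
{
	for (int i = 0; i < 25; i++)
		record_request(true);	/* a burst of random I/O */
	printf("seeky: %d\n", queue_is_seeky());	/* prints 1 */
	return 0;
}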
@@ -623,26 +628,6 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
	bfqq->pos_root = NULL;
}

/*
 * Tell whether there are active queues with different weights or
 * active groups.
 */
static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
{
	/*
	 * For queue weights to differ, queue_weights_tree must contain
	 * at least two nodes.
	 */
	return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
		(bfqd->queue_weights_tree.rb_node->rb_left ||
		 bfqd->queue_weights_tree.rb_node->rb_right)
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	       ) ||
	       (bfqd->num_groups_with_pending_reqs > 0
#endif
	       );
}

/*
 * The following function returns true if every queue must receive the
 * same share of the throughput (this condition is used when deciding
@@ -651,25 +636,48 @@ static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
 *
 * Such a scenario occurs when:
 * 1) all active queues have the same weight,
 * 2) all active groups at the same level in the groups tree have the same
 *    weight,
 * 2) all active queues belong to the same I/O-priority class,
 * 3) all active groups at the same level in the groups tree have the same
 *    weight,
 * 4) all active groups at the same level in the groups tree have the same
 *    number of children.
 *
 * Unfortunately, keeping the necessary state for evaluating exactly
 * the last two symmetry sub-conditions above would be quite complex
 * and time consuming. Therefore this function evaluates, instead,
 * only the following stronger two sub-conditions, for which it is
 * and time consuming. Therefore this function evaluates, instead,
 * only the following stronger three sub-conditions, for which it is
 * much easier to maintain the needed state:
 * 1) all active queues have the same weight,
 * 2) there are no active groups.
 * 2) all active queues belong to the same I/O-priority class,
 * 3) there are no active groups.
 * In particular, the last condition is always true if hierarchical
 * support or the cgroups interface are not enabled, thus no state
 * needs to be maintained in this case.
 */
static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
{
	return !bfq_varied_queue_weights_or_active_groups(bfqd);
	/*
	 * For queue weights to differ, queue_weights_tree must contain
	 * at least two nodes.
	 */
	bool varied_queue_weights = !RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
		(bfqd->queue_weights_tree.rb_node->rb_left ||
		 bfqd->queue_weights_tree.rb_node->rb_right);

	bool multiple_classes_busy =
		(bfqd->busy_queues[0] && bfqd->busy_queues[1]) ||
		(bfqd->busy_queues[0] && bfqd->busy_queues[2]) ||
		(bfqd->busy_queues[1] && bfqd->busy_queues[2]);

	/*
	 * For queue weights to differ, queue_weights_tree must contain
	 * at least two nodes.
	 */
	return !(varied_queue_weights || multiple_classes_busy
#ifdef BFQ_GROUP_IOSCHED_ENABLED
	       || bfqd->num_groups_with_pending_reqs > 0
#endif
	       );
}

/*
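The multiple_classes_busy expression introduced above is simply a pairwise test over the three I/O-priority classes. The standalone sketch below (plain C, invented names, not kernel code) checks that it is equivalent to asking whether at least two of the RT/BE/IDLE counters are non-zero, which is what "all active queues belong to the same I/O-priority class" negates.

#include <assert.h>
#include <stdbool.h>

/* busy[0], busy[1], busy[2]: busy queues in the RT, BE and IDLE
 * classes, mirroring the new bfqd->busy_queues[3] array. */
static bool multiple_classes_busy(const unsigned int busy[3])
{
	return (busy[0] && busy[1]) ||
	       (busy[0] && busy[2]) ||
	       (busy[1] && busy[2]);
}

/* Equivalent formulation: at least two classes have busy queues. */
static bool at_least_two_classes_busy(const unsigned int busy[3])
{
	int nonempty = !!busy[0] + !!busy[1] + !!busy[2];

	return nonempty >= 2;
}

int main(void)
{
	const unsigned int cases[][3] = {
		{0, 0, 0}, {3, 0, 0}, {1, 2, 0}, {0, 4, 1}, {1, 1, 1},
	};

	for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
		assert(multiple_classes_busy(cases[i]) ==
		       at_least_two_classes_busy(cases[i]));
	return 0;
}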
@@ -728,15 +736,14 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
	/*
	 * In the unlucky event of an allocation failure, we just
	 * exit. This will cause the weight of queue to not be
	 * considered in bfq_varied_queue_weights_or_active_groups,
	 * which, in its turn, causes the scenario to be deemed
	 * wrongly symmetric in case bfqq's weight would have been
	 * the only weight making the scenario asymmetric. On the
	 * bright side, no unbalance will however occur when bfqq
	 * becomes inactive again (the invocation of this function
	 * is triggered by an activation of queue). In fact,
	 * bfq_weights_tree_remove does nothing if
	 * !bfqq->weight_counter.
	 * considered in bfq_symmetric_scenario, which, in its turn,
	 * causes the scenario to be deemed wrongly symmetric in case
	 * bfqq's weight would have been the only weight making the
	 * scenario asymmetric. On the bright side, no unbalance will
	 * however occur when bfqq becomes inactive again (the
	 * invocation of this function is triggered by an activation
	 * of queue). In fact, bfq_weights_tree_remove does nothing
	 * if !bfqq->weight_counter.
	 */
	if (unlikely(!bfqq->weight_counter))
		return;

@@ -747,6 +754,7 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,

inc_counter:
	bfqq->weight_counter->num_active++;
	bfqq->ref++;
}

/*
@@ -771,6 +779,7 @@ void __bfq_weights_tree_remove(struct bfq_data *bfqd,

reset_entity_pointer:
	bfqq->weight_counter = NULL;
	bfq_put_queue(bfqq);
}

/*
@@ -782,9 +791,6 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd,
{
	struct bfq_entity *entity = bfqq->entity.parent;

	__bfq_weights_tree_remove(bfqd, bfqq,
				  &bfqd->queue_weights_tree);

	for_each_entity(entity) {
		struct bfq_sched_data *sd = entity->my_sched_data;

@@ -818,6 +824,15 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd,
			bfqd->num_groups_with_pending_reqs--;
		}
	}

	/*
	 * Next function is invoked last, because it causes bfqq to be
	 * freed if the following holds: bfqq is not in service and
	 * has no dispatched request. DO NOT use bfqq after the next
	 * function invocation.
	 */
	__bfq_weights_tree_remove(bfqd, bfqq,
				  &bfqd->queue_weights_tree);
}

/*
@@ -873,7 +888,8 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
static unsigned long bfq_serv_to_charge(struct request *rq,
					struct bfq_queue *bfqq)
{
	if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
	if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1 ||
	    !bfq_symmetric_scenario(bfqq->bfqd))
		return blk_rq_sectors(rq);

	return blk_rq_sectors(rq) * bfq_async_charge_factor;

@@ -907,8 +923,10 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
		 */
		return;

	new_budget = max_t(unsigned long, bfqq->max_budget,
			   bfq_serv_to_charge(next_rq, bfqq));
	new_budget = max_t(unsigned long,
			   max_t(unsigned long, bfqq->max_budget,
				 bfq_serv_to_charge(next_rq, bfqq)),
			   entity->service);
	if (entity->budget != new_budget) {
		entity->budget = new_budget;
		bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",

@@ -1011,7 +1029,8 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,

static int bfqq_process_refs(struct bfq_queue *bfqq)
{
	return bfqq->ref - bfqq->allocated - bfqq->entity.on_st;
	return bfqq->ref - bfqq->allocated - bfqq->entity.on_st -
	       (bfqq->weight_counter != NULL);
}

/* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */
@@ -1380,7 +1399,15 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
{
	struct bfq_entity *entity = &bfqq->entity;

	if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
	/*
	 * In the next compound condition, we check also whether there
	 * is some budget left, because otherwise there is no point in
	 * trying to go on serving bfqq with this same budget: bfqq
	 * would be expired immediately after being selected for
	 * service. This would only cause useless overhead.
	 */
	if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time &&
	    bfq_bfqq_budget_left(bfqq) > 0) {
		/*
		 * We do not clear the flag non_blocking_wait_rq here, as
		 * the latter is used in bfq_activate_bfqq to signal
@@ -2217,14 +2244,15 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
		return NULL;

	/* If there is only one backlogged queue, don't search. */
	if (bfqd->busy_queues == 1)
	if (bfq_tot_busy_queues(bfqd) == 1)
		return NULL;

	in_service_bfqq = bfqd->in_service_queue;

	if (in_service_bfqq && in_service_bfqq != bfqq &&
	    likely(in_service_bfqq != &bfqd->oom_bfqq) &&
	    bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
	    bfq_rq_close_to_sector(io_struct, request,
				   bfqd->in_serv_last_pos) &&
	    bfqq->entity.parent == in_service_bfqq->entity.parent &&
	    bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
		new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);

@@ -2742,7 +2770,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)

	if ((bfqd->rq_in_driver > 0 ||
	     now_ns - bfqd->last_completion < BFQ_MIN_TT)
	    && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
	    && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq))
		bfqd->sequential_samples++;

	bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);

@@ -2764,6 +2792,8 @@ update_rate_and_reset:
	bfq_update_rate_reset(bfqd, rq);
update_last_values:
	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
	if (RQ_BFQQ(rq) == bfqd->in_service_queue)
		bfqd->in_serv_last_pos = bfqd->last_position;
	bfqd->last_dispatch = now_ns;
}

@@ -3274,16 +3304,32 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
		 * requests, then the request pattern is isochronous
		 * (see the comments on the function
		 * bfq_bfqq_softrt_next_start()). Thus we can compute
		 * soft_rt_next_start. If, instead, the queue still
		 * has outstanding requests, then we have to wait for
		 * the completion of all the outstanding requests to
		 * discover whether the request pattern is actually
		 * isochronous.
		 * soft_rt_next_start. And we do it, unless bfqq is in
		 * interactive weight raising. We do not do it in the
		 * latter subcase, for the following reason. bfqq may
		 * be conveying the I/O needed to load a soft
		 * real-time application. Such an application will
		 * actually exhibit a soft real-time I/O pattern after
		 * it finally starts doing its job. But, if
		 * soft_rt_next_start is computed here for an
		 * interactive bfqq, and bfqq had received a lot of
		 * service before remaining with no outstanding
		 * request (likely to happen on a fast device), then
		 * soft_rt_next_start would be assigned such a high
		 * value that, for a very long time, bfqq would be
		 * prevented from being possibly considered as soft
		 * real time.
		 *
		 * If, instead, the queue still has outstanding
		 * requests, then we have to wait for the completion
		 * of all the outstanding requests to discover whether
		 * the request pattern is actually isochronous.
		 */
		if (bfqq->dispatched == 0)
		if (bfqq->dispatched == 0 &&
		    bfqq->wr_coeff != bfqd->bfq_wr_coeff)
			bfqq->soft_rt_next_start =
				bfq_bfqq_softrt_next_start(bfqd, bfqq);
		else {
		else if (bfqq->dispatched > 0) {
			/*
			 * Schedule an update of soft_rt_next_start to when
			 * the task may be discovered to be isochronous.
@@ -3376,53 +3422,13 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
		bfq_bfqq_budget_timeout(bfqq);
}

/*
 * For a queue that becomes empty, device idling is allowed only if
 * this function returns true for the queue. As a consequence, since
 * device idling plays a critical role in both throughput boosting and
 * service guarantees, the return value of this function plays a
 * critical role in both these aspects as well.
 *
 * In a nutshell, this function returns true only if idling is
 * beneficial for throughput or, even if detrimental for throughput,
 * idling is however necessary to preserve service guarantees (low
 * latency, desired throughput distribution, ...). In particular, on
 * NCQ-capable devices, this function tries to return false, so as to
 * help keep the drives' internal queues full, whenever this helps the
 * device boost the throughput without causing any service-guarantee
 * issue.
 *
 * In more detail, the return value of this function is obtained by,
 * first, computing a number of boolean variables that take into
 * account throughput and service-guarantee issues, and, then,
 * combining these variables in a logical expression. Most of the
 * issues taken into account are not trivial. We discuss these issues
 * individually while introducing the variables.
 */
static bool bfq_better_to_idle(struct bfq_queue *bfqq)
static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
					     struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;
	bool rot_without_queueing =
		!blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag,
		bfqq_sequential_and_IO_bound,
		idling_boosts_thr, idling_boosts_thr_without_issues,
		idling_needed_for_service_guarantees,
		asymmetric_scenario;

	if (bfqd->strict_guarantees)
		return true;

	/*
	 * Idling is performed only if slice_idle > 0. In addition, we
	 * do not idle if
	 * (a) bfqq is async
	 * (b) bfqq is in the idle io prio class: in this case we do
	 * not idle because we want to minimize the bandwidth that
	 * queues in this class can steal to higher-priority queues
	 */
	if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
	    bfq_class_idle(bfqq))
		return false;
		idling_boosts_thr;

	bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) &&
		bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq);

@@ -3454,8 +3460,7 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
		bfqq_sequential_and_IO_bound);

	/*
	 * The value of the next variable,
	 * idling_boosts_thr_without_issues, is equal to that of
	 * The return value of this function is equal to that of
	 * idling_boosts_thr, unless a special case holds. In this
	 * special case, described below, idling may cause problems to
	 * weight-raised queues.
@@ -3472,217 +3477,252 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
	 * which enqueue several requests in advance, and further
	 * reorder internally-queued requests.
	 *
	 * For this reason, we force to false the value of
	 * idling_boosts_thr_without_issues if there are weight-raised
	 * busy queues. In this case, and if bfqq is not weight-raised,
	 * this guarantees that the device is not idled for bfqq (if,
	 * instead, bfqq is weight-raised, then idling will be
	 * guaranteed by another variable, see below). Combined with
	 * the timestamping rules of BFQ (see [1] for details), this
	 * behavior causes bfqq, and hence any sync non-weight-raised
	 * queue, to get a lower number of requests served, and thus
	 * to ask for a lower number of requests from the request
	 * pool, before the busy weight-raised queues get served
	 * again. This often mitigates starvation problems in the
	 * presence of heavy write workloads and NCQ, thereby
	 * guaranteeing a higher application and system responsiveness
	 * in these hostile scenarios.
	 * For this reason, we force to false the return value if
	 * there are weight-raised busy queues. In this case, and if
	 * bfqq is not weight-raised, this guarantees that the device
	 * is not idled for bfqq (if, instead, bfqq is weight-raised,
	 * then idling will be guaranteed by another variable, see
	 * below). Combined with the timestamping rules of BFQ (see
	 * [1] for details), this behavior causes bfqq, and hence any
	 * sync non-weight-raised queue, to get a lower number of
	 * requests served, and thus to ask for a lower number of
	 * requests from the request pool, before the busy
	 * weight-raised queues get served again. This often mitigates
	 * starvation problems in the presence of heavy write
	 * workloads and NCQ, thereby guaranteeing a higher
	 * application and system responsiveness in these hostile
	 * scenarios.
	 */
	idling_boosts_thr_without_issues = idling_boosts_thr &&
	return idling_boosts_thr &&
		bfqd->wr_busy_queues == 0;
}

	/*
	 * There is then a case where idling must be performed not
	 * for throughput concerns, but to preserve service
	 * guarantees.
	 *
	 * [... the rest of this comment is moved by this change, word for
	 *  word, to just above idling_needed_for_service_guarantees()
	 *  below, and is not repeated here ...]
	 */
	asymmetric_scenario = (bfqq->wr_coeff > 1 &&
			       bfqd->wr_busy_queues < bfqd->busy_queues) ||

/*
 * There is a case where idling must be performed not for
 * throughput concerns, but to preserve service guarantees.
 *
 * To introduce this case, we can note that allowing the drive
 * to enqueue more than one request at a time, and hence
 * delegating de facto final scheduling decisions to the
 * drive's internal scheduler, entails loss of control on the
 * actual request service order. In particular, the critical
 * situation is when requests from different processes happen
 * to be present, at the same time, in the internal queue(s)
 * of the drive. In such a situation, the drive, by deciding
 * the service order of the internally-queued requests, does
 * determine also the actual throughput distribution among
 * these processes. But the drive typically has no notion or
 * concern about per-process throughput distribution, and
 * makes its decisions only on a per-request basis. Therefore,
 * the service distribution enforced by the drive's internal
 * scheduler is likely to coincide with the desired
 * device-throughput distribution only in a completely
 * symmetric scenario where:
 * (i) each of these processes must get the same throughput as
 *     the others;
 * (ii) the I/O of each process has the same properties, in
 *      terms of locality (sequential or random), direction
 *      (reads or writes), request sizes, greediness
 *      (from I/O-bound to sporadic), and so on.
 * In fact, in such a scenario, the drive tends to treat
 * the requests of each of these processes in about the same
 * way as the requests of the others, and thus to provide
 * each of these processes with about the same throughput
 * (which is exactly the desired throughput distribution). In
 * contrast, in any asymmetric scenario, device idling is
 * certainly needed to guarantee that bfqq receives its
 * assigned fraction of the device throughput (see [1] for
 * details).
 * The problem is that idling may significantly reduce
 * throughput with certain combinations of types of I/O and
 * devices. An important example is sync random I/O, on flash
 * storage with command queueing. So, unless bfqq falls in the
 * above cases where idling also boosts throughput, it would
 * be important to check conditions (i) and (ii) accurately,
 * so as to avoid idling when not strictly needed for service
 * guarantees.
 *
 * Unfortunately, it is extremely difficult to thoroughly
 * check condition (ii). And, in case there are active groups,
 * it becomes very difficult to check condition (i) too. In
 * fact, if there are active groups, then, for condition (i)
 * to become false, it is enough that an active group contains
 * more active processes or sub-groups than some other active
 * group. More precisely, for condition (i) to hold because of
 * such a group, it is not even necessary that the group is
 * (still) active: it is sufficient that, even if the group
 * has become inactive, some of its descendant processes still
 * have some request already dispatched but still waiting for
 * completion. In fact, requests have still to be guaranteed
 * their share of the throughput even after being
 * dispatched. In this respect, it is easy to show that, if a
 * group frequently becomes inactive while still having
 * in-flight requests, and if, when this happens, the group is
 * not considered in the calculation of whether the scenario
 * is asymmetric, then the group may fail to be guaranteed its
 * fair share of the throughput (basically because idling may
 * not be performed for the descendant processes of the group,
 * but it had to be). We address this issue with the
 * following bi-modal behavior, implemented in the function
 * bfq_symmetric_scenario().
 *
 * If there are groups with requests waiting for completion
 * (as commented above, some of these groups may even be
 * already inactive), then the scenario is tagged as
 * asymmetric, conservatively, without checking any of the
 * conditions (i) and (ii). So the device is idled for bfqq.
 * This behavior matches also the fact that groups are created
 * exactly if controlling I/O is a primary concern (to
 * preserve bandwidth and latency guarantees).
 *
 * On the opposite end, if there are no groups with requests
 * waiting for completion, then only condition (i) is actually
 * controlled, i.e., provided that condition (i) holds, idling
 * is not performed, regardless of whether condition (ii)
 * holds. In other words, only if condition (i) does not hold,
 * then idling is allowed, and the device tends to be
 * prevented from queueing many requests, possibly of several
 * processes. Since there are no groups with requests waiting
 * for completion, then, to control condition (i) it is enough
 * to check just whether all the queues with requests waiting
 * for completion also have the same weight.
 *
 * Not checking condition (ii) evidently exposes bfqq to the
 * risk of getting less throughput than its fair share.
 * However, for queues with the same weight, a further
 * mechanism, preemption, mitigates or even eliminates this
 * problem. And it does so without consequences on overall
 * throughput. This mechanism and its benefits are explained
 * in the next three paragraphs.
 *
 * Even if a queue, say Q, is expired when it remains idle, Q
 * can still preempt the new in-service queue if the next
 * request of Q arrives soon (see the comments on
 * bfq_bfqq_update_budg_for_activation). If all queues and
 * groups have the same weight, this form of preemption,
 * combined with the hole-recovery heuristic described in the
 * comments on function bfq_bfqq_update_budg_for_activation,
 * are enough to preserve a correct bandwidth distribution in
 * the mid term, even without idling. In fact, even if not
 * idling allows the internal queues of the device to contain
 * many requests, and thus to reorder requests, we can rather
 * safely assume that the internal scheduler still preserves a
 * minimum of mid-term fairness.
 *
 * More precisely, this preemption-based, idleless approach
 * provides fairness in terms of IOPS, and not sectors per
 * second. This can be seen with a simple example. Suppose
 * that there are two queues with the same weight, but that
 * the first queue receives requests of 8 sectors, while the
 * second queue receives requests of 1024 sectors. In
 * addition, suppose that each of the two queues contains at
 * most one request at a time, which implies that each queue
 * always remains idle after it is served. Finally, after
 * remaining idle, each queue receives very quickly a new
 * request. It follows that the two queues are served
 * alternatively, preempting each other if needed. This
 * implies that, although both queues have the same weight,
 * the queue with large requests receives a service that is
 * 1024/8 times as high as the service received by the other
 * queue.
 *
 * The motivation for using preemption instead of idling (for
 * queues with the same weight) is that, by not idling,
 * service guarantees are preserved (completely or at least in
 * part) without minimally sacrificing throughput. And, if
 * there is no active group, then the primary expectation for
 * this device is probably a high throughput.
 *
 * We are now left only with explaining the additional
 * compound condition that is checked below for deciding
 * whether the scenario is asymmetric. To explain this
 * compound condition, we need to add that the function
 * bfq_symmetric_scenario checks the weights of only
 * non-weight-raised queues, for efficiency reasons (see
 * comments on bfq_weights_tree_add()). Then the fact that
 * bfqq is weight-raised is checked explicitly here. More
 * precisely, the compound condition below takes into account
 * also the fact that, even if bfqq is being weight-raised,
 * the scenario is still symmetric if all queues with requests
 * waiting for completion happen to be
 * weight-raised. Actually, we should be even more precise
 * here, and differentiate between interactive weight raising
 * and soft real-time weight raising.
 *
 * As a side note, it is worth considering that the above
 * device-idling countermeasures may however fail in the
 * following unlucky scenario: if idling is (correctly)
 * disabled in a time period during which all symmetry
 * sub-conditions hold, and hence the device is allowed to
 * enqueue many requests, but at some later point in time some
 * sub-condition stops to hold, then it may become impossible
 * to let requests be served in the desired order until all
 * the requests already queued in the device have been served.
 */
static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
						 struct bfq_queue *bfqq)
{
	return (bfqq->wr_coeff > 1 &&
		bfqd->wr_busy_queues <
		bfq_tot_busy_queues(bfqd)) ||
		!bfq_symmetric_scenario(bfqd);
}

/*
 * For a queue that becomes empty, device idling is allowed only if
 * this function returns true for that queue. As a consequence, since
 * device idling plays a critical role for both throughput boosting
 * and service guarantees, the return value of this function plays a
 * critical role as well.
 *
 * In a nutshell, this function returns true only if idling is
 * beneficial for throughput or, even if detrimental for throughput,
 * idling is however necessary to preserve service guarantees (low
 * latency, desired throughput distribution, ...). In particular, on
 * NCQ-capable devices, this function tries to return false, so as to
 * help keep the drives' internal queues full, whenever this helps the
 * device boost the throughput without causing any service-guarantee
 * issue.
 *
 * Most of the issues taken into account to get the return value of
 * this function are not trivial. We discuss these issues in the two
 * functions providing the main pieces of information needed by this
 * function.
 */
static bool bfq_better_to_idle(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;
	bool idling_boosts_thr_with_no_issue, idling_needed_for_service_guar;

	if (unlikely(bfqd->strict_guarantees))
		return true;

	/*
	 * Finally, there is a case where maximizing throughput is the
	 * best choice even if it may cause unfairness toward
	 * bfqq. Such a case is when bfqq became active in a burst of
	 * queue activations. Queues that became active during a large
	 * burst benefit only from throughput, as discussed in the
	 * comments on bfq_handle_burst. Thus, if bfqq became active
	 * in a burst and not idling the device maximizes throughput,
	 * then the device must no be idled, because not idling the
	 * device provides bfqq and all other queues in the burst with
	 * maximum benefit. Combining this and the above case, we can
	 * now establish when idling is actually needed to preserve
	 * service guarantees.
	 * Idling is performed only if slice_idle > 0. In addition, we
	 * do not idle if
	 * (a) bfqq is async
	 * (b) bfqq is in the idle io prio class: in this case we do
	 * not idle because we want to minimize the bandwidth that
	 * queues in this class can steal to higher-priority queues
	 */
	idling_needed_for_service_guarantees =
		asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
	if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
	    bfq_class_idle(bfqq))
		return false;

	idling_boosts_thr_with_no_issue =
		idling_boosts_thr_without_issues(bfqd, bfqq);

	idling_needed_for_service_guar =
		idling_needed_for_service_guarantees(bfqd, bfqq);

	/*
	 * We have now all the components we need to compute the
	 * We have now the two components we need to compute the
	 * return value of the function, which is true only if idling
	 * either boosts the throughput (without issues), or is
	 * necessary to preserve service guarantees.
	 */
	return idling_boosts_thr_without_issues ||
		idling_needed_for_service_guarantees;
	return idling_boosts_thr_with_no_issue ||
		idling_needed_for_service_guar;
}

/*
@@ -3934,7 +3974,7 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
	 * belongs to CLASS_IDLE and other queues are waiting for
	 * service.
	 */
	if (!(bfqd->busy_queues > 1 && bfq_class_idle(bfqq)))
	if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq)))
		goto return_rq;

	bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);

@@ -3952,7 +3992,7 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
	 * most a call to dispatch for nothing
	 */
	return !list_empty_careful(&bfqd->dispatch) ||
		bfqd->busy_queues > 0;
		bfq_tot_busy_queues(bfqd) > 0;
}

static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)

@@ -4006,9 +4046,10 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
		goto start_rq;
	}

	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
	bfq_log(bfqd, "dispatch requests: %d busy queues",
		bfq_tot_busy_queues(bfqd));

	if (bfqd->busy_queues == 0)
	if (bfq_tot_busy_queues(bfqd) == 0)
		goto exit;

	/*

@@ -4488,10 +4529,7 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			struct request *rq)
{
	bfqq->seek_history <<= 1;
	bfqq->seek_history |=
		get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
		(!blk_queue_nonrot(bfqd->queue) ||
		 blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
	bfqq->seek_history |= BFQ_RQ_SEEKY(bfqd, bfqq->last_request_pos, rq);
}

static void bfq_update_has_short_ttime(struct bfq_data *bfqd,

@@ -4560,28 +4598,31 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
		bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);

		/*
		 * There is just this request queued: if the request
		 * is small and the queue is not to be expired, then
		 * just exit.
		 * There is just this request queued: if
		 * - the request is small, and
		 * - we are idling to boost throughput, and
		 * - the queue is not to be expired,
		 * then just exit.
		 *
		 * In this way, if the device is being idled to wait
		 * for a new request from the in-service queue, we
		 * avoid unplugging the device and committing the
		 * device to serve just a small request. On the
		 * contrary, we wait for the block layer to decide
		 * when to unplug the device: hopefully, new requests
		 * will be merged to this one quickly, then the device
		 * will be unplugged and larger requests will be
		 * dispatched.
		 * device to serve just a small request. In contrast
		 * we wait for the block layer to decide when to
		 * unplug the device: hopefully, new requests will be
		 * merged to this one quickly, then the device will be
		 * unplugged and larger requests will be dispatched.
		 */
		if (small_req && !budget_timeout)
		if (small_req && idling_boosts_thr_without_issues(bfqd, bfqq) &&
		    !budget_timeout)
			return;

		/*
		 * A large enough request arrived, or the queue is to
		 * be expired: in both cases disk idling is to be
		 * stopped, so clear wait_request flag and reset
		 * timer.
		 * A large enough request arrived, or idling is being
		 * performed to preserve service guarantees, or
		 * finally the queue is to be expired: in all these
		 * cases disk idling is to be stopped, so clear
		 * wait_request flag and reset timer.
		 */
		bfq_clear_bfqq_wait_request(bfqq);
		hrtimer_try_to_cancel(&bfqd->idle_slice_timer);

@@ -4607,8 +4648,6 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
	bool waiting, idle_timer_disabled = false;

	if (new_bfqq) {
		if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
			new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
		/*
		 * Release the request's reference to the old bfqq
		 * and make sure one is taken to the shared queue.

@@ -4751,6 +4790,8 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,

static void bfq_update_hw_tag(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq = bfqd->in_service_queue;

	bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
				       bfqd->rq_in_driver);

@@ -4763,7 +4804,18 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd)
	 * sum is not exact, as it's not taking into account deactivated
	 * requests.
	 */
	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
	if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD)
		return;

	/*
	 * If active queue hasn't enough requests and can idle, bfq might not
	 * dispatch sufficient requests to hardware. Don't zero hw_tag in this
	 * case
	 */
	if (bfqq && bfq_bfqq_has_short_ttime(bfqq) &&
	    bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] <
	    BFQ_HW_QUEUE_THRESHOLD &&
	    bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD)
		return;

	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)

@@ -4834,11 +4886,14 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
	 * isochronous, and both requisites for this condition to hold
	 * are now satisfied, then compute soft_rt_next_start (see the
	 * comments on the function bfq_bfqq_softrt_next_start()). We
	 * schedule this delayed check when bfqq expires, if it still
	 * has in-flight requests.
	 * do not compute soft_rt_next_start if bfqq is in interactive
	 * weight raising (see the comments in bfq_bfqq_expire() for
	 * an explanation). We schedule this delayed update when bfqq
	 * expires, if it still has in-flight requests.
	 */
	if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
	    RB_EMPTY_ROOT(&bfqq->sort_list))
	    RB_EMPTY_ROOT(&bfqq->sort_list) &&
	    bfqq->wr_coeff != bfqd->bfq_wr_coeff)
		bfqq->soft_rt_next_start =
			bfq_bfqq_softrt_next_start(bfqd, bfqq);

block/bfq-iosched.h
@@ -501,10 +501,11 @@ struct bfq_data {
	unsigned int num_groups_with_pending_reqs;

	/*
	 * Number of bfq_queues containing requests (including the
	 * queue in service, even if it is idling).
	 * Per-class (RT, BE, IDLE) number of bfq_queues containing
	 * requests (including the queue in service, even if it is
	 * idling).
	 */
	int busy_queues;
	unsigned int busy_queues[3];
	/* number of weight-raised busy @bfq_queues */
	int wr_busy_queues;
	/* number of queued requests */

@@ -537,6 +538,9 @@ struct bfq_data {
	/* on-disk position of the last served request */
	sector_t last_position;

	/* position of the last served request for the in-service queue */
	sector_t in_serv_last_pos;

	/* time of last request completion (ns) */
	u64 last_completion;

@@ -974,6 +978,7 @@ extern struct blkcg_policy blkcg_policy_bfq;

struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq);
struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd);
struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity);
struct bfq_entity *bfq_entity_of(struct rb_node *node);
unsigned short bfq_ioprio_to_weight(int ioprio);
block/bfq-wf2q.c
@@ -44,6 +44,12 @@ static unsigned int bfq_class_idx(struct bfq_entity *entity)
		BFQ_DEFAULT_GRP_CLASS - 1;
}

unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd)
{
	return bfqd->busy_queues[0] + bfqd->busy_queues[1] +
		bfqd->busy_queues[2];
}

static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
						 bool expiration);

@@ -1513,7 +1519,7 @@ struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
	struct bfq_sched_data *sd;
	struct bfq_queue *bfqq;

	if (bfqd->busy_queues == 0)
	if (bfq_tot_busy_queues(bfqd) == 0)
		return NULL;

	/*

@@ -1665,10 +1671,7 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,

	bfq_clear_bfqq_busy(bfqq);

	bfqd->busy_queues--;

	if (!bfqq->dispatched)
		bfq_weights_tree_remove(bfqd, bfqq);
	bfqd->busy_queues[bfqq->ioprio_class - 1]--;

	if (bfqq->wr_coeff > 1)
		bfqd->wr_busy_queues--;

@@ -1676,6 +1679,9 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
	bfqg_stats_update_dequeue(bfqq_group(bfqq));

	bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);

	if (!bfqq->dispatched)
		bfq_weights_tree_remove(bfqd, bfqq);
}

/*

@@ -1688,7 +1694,7 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
	bfq_activate_bfqq(bfqd, bfqq);

	bfq_mark_bfqq_busy(bfqq);
	bfqd->busy_queues++;
	bfqd->busy_queues[bfqq->ioprio_class - 1]++;

	if (!bfqq->dispatched)
	if (bfqq->wr_coeff == 1)
@@ -753,6 +753,8 @@ EXPORT_SYMBOL(bio_add_pc_page);
 * @page: page to add
 * @len: length of the data to add
 * @off: offset of the data in @page
 * @same_page: if %true only merge if the new data is in the same physical
 *	page as the last segment of the bio.
 *
 * Try to add the data at @page + @off to the last bvec of @bio. This is a
 * a useful optimisation for file systems with a block size smaller than the
@@ -761,19 +763,25 @@ EXPORT_SYMBOL(bio_add_pc_page);
 * Return %true on success or %false on failure.
 */
bool __bio_try_merge_page(struct bio *bio, struct page *page,
		unsigned int len, unsigned int off)
		unsigned int len, unsigned int off, bool same_page)
{
	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
		return false;

	if (bio->bi_vcnt > 0) {
		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
		phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) +
			bv->bv_offset + bv->bv_len - 1;
		phys_addr_t page_addr = page_to_phys(page);

		if (page == bv->bv_page && off == bv->bv_offset + bv->bv_len) {
			bv->bv_len += len;
			bio->bi_iter.bi_size += len;
			return true;
		}
		if (vec_end_addr + 1 != page_addr + off)
			return false;
		if (same_page && (vec_end_addr & PAGE_MASK) != page_addr)
			return false;

		bv->bv_len += len;
		bio->bi_iter.bi_size += len;
		return true;
	}
	return false;
}
@@ -819,7 +827,7 @@ EXPORT_SYMBOL_GPL(__bio_add_page);
int bio_add_page(struct bio *bio, struct page *page,
		 unsigned int len, unsigned int offset)
{
	if (!__bio_try_merge_page(bio, page, len, offset)) {
	if (!__bio_try_merge_page(bio, page, len, offset, false)) {
		if (bio_full(bio))
			return 0;
		__bio_add_page(bio, page, len, offset);
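bio_add_page() above first tries to merge the new data into the last bvec (now passing same_page=false) and only consumes a fresh bvec slot when merging fails. A standalone C sketch of that merge-then-append policy; this models only the simple same-page case, and the struct and helper names are made up for illustration:

#include <stdbool.h>
#include <stdio.h>

struct vec { unsigned long page; unsigned int off, len; };

struct minibio {
	struct vec io_vec[4];
	int vcnt;
	unsigned int size;
};

/* Append contiguous data to the last vector when possible (cf. __bio_try_merge_page). */
static bool try_merge_page(struct minibio *bio, unsigned long page,
			   unsigned int len, unsigned int off)
{
	if (bio->vcnt > 0) {
		struct vec *bv = &bio->io_vec[bio->vcnt - 1];

		if (page == bv->page && off == bv->off + bv->len) {
			bv->len += len;
			bio->size += len;
			return true;
		}
	}
	return false;
}

/* cf. bio_add_page(): merge first, otherwise take a new bvec slot. */
static int add_page(struct minibio *bio, unsigned long page,
		    unsigned int len, unsigned int off)
{
	if (!try_merge_page(bio, page, len, off)) {
		if (bio->vcnt == 4)
			return 0;                       /* bio is full */
		bio->io_vec[bio->vcnt++] = (struct vec){ page, off, len };
		bio->size += len;
	}
	return len;
}

int main(void)
{
	struct minibio bio = { 0 };

	add_page(&bio, 1, 512, 0);
	add_page(&bio, 1, 512, 512);    /* merges into the previous vec */
	printf("vcnt=%d size=%u\n", bio.vcnt, bio.size);
	return 0;
}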
@ -1072,8 +1080,9 @@ static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter)
|
||||
{
|
||||
int i;
|
||||
struct bio_vec *bvec;
|
||||
struct bvec_iter_all iter_all;
|
||||
|
||||
bio_for_each_segment_all(bvec, bio, i) {
|
||||
bio_for_each_segment_all(bvec, bio, i, iter_all) {
|
||||
ssize_t ret;
|
||||
|
||||
ret = copy_page_from_iter(bvec->bv_page,
|
||||
@ -1103,8 +1112,9 @@ static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter)
|
||||
{
|
||||
int i;
|
||||
struct bio_vec *bvec;
|
||||
struct bvec_iter_all iter_all;
|
||||
|
||||
bio_for_each_segment_all(bvec, bio, i) {
|
||||
bio_for_each_segment_all(bvec, bio, i, iter_all) {
|
||||
ssize_t ret;
|
||||
|
||||
ret = copy_page_to_iter(bvec->bv_page,
|
||||
@ -1126,8 +1136,9 @@ void bio_free_pages(struct bio *bio)
|
||||
{
|
||||
struct bio_vec *bvec;
|
||||
int i;
|
||||
struct bvec_iter_all iter_all;
|
||||
|
||||
bio_for_each_segment_all(bvec, bio, i)
|
||||
bio_for_each_segment_all(bvec, bio, i, iter_all)
|
||||
__free_page(bvec->bv_page);
|
||||
}
|
||||
EXPORT_SYMBOL(bio_free_pages);
|
||||
@ -1295,6 +1306,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
|
||||
struct bio *bio;
|
||||
int ret;
|
||||
struct bio_vec *bvec;
|
||||
struct bvec_iter_all iter_all;
|
||||
|
||||
if (!iov_iter_count(iter))
|
||||
return ERR_PTR(-EINVAL);
|
||||
@ -1368,7 +1380,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
|
||||
return bio;
|
||||
|
||||
out_unmap:
|
||||
bio_for_each_segment_all(bvec, bio, j) {
|
||||
bio_for_each_segment_all(bvec, bio, j, iter_all) {
|
||||
put_page(bvec->bv_page);
|
||||
}
|
||||
bio_put(bio);
|
||||
@ -1379,11 +1391,12 @@ static void __bio_unmap_user(struct bio *bio)
|
||||
{
|
||||
struct bio_vec *bvec;
|
||||
int i;
|
||||
struct bvec_iter_all iter_all;
|
||||
|
||||
/*
|
||||
* make sure we dirty pages we wrote to
|
||||
*/
|
||||
bio_for_each_segment_all(bvec, bio, i) {
|
||||
bio_for_each_segment_all(bvec, bio, i, iter_all) {
|
||||
if (bio_data_dir(bio) == READ)
|
||||
set_page_dirty_lock(bvec->bv_page);
|
||||
|
||||
@ -1475,8 +1488,9 @@ static void bio_copy_kern_endio_read(struct bio *bio)
|
||||
char *p = bio->bi_private;
|
||||
struct bio_vec *bvec;
|
||||
int i;
|
||||
struct bvec_iter_all iter_all;
|
||||
|
||||
bio_for_each_segment_all(bvec, bio, i) {
|
||||
bio_for_each_segment_all(bvec, bio, i, iter_all) {
|
||||
memcpy(p, page_address(bvec->bv_page), bvec->bv_len);
|
||||
p += bvec->bv_len;
|
||||
}
|
||||
@ -1585,8 +1599,9 @@ void bio_set_pages_dirty(struct bio *bio)
|
||||
{
|
||||
struct bio_vec *bvec;
|
||||
int i;
|
||||
struct bvec_iter_all iter_all;
|
||||
|
||||
bio_for_each_segment_all(bvec, bio, i) {
|
||||
bio_for_each_segment_all(bvec, bio, i, iter_all) {
|
||||
if (!PageCompound(bvec->bv_page))
|
||||
set_page_dirty_lock(bvec->bv_page);
|
||||
}
|
||||
@ -1596,8 +1611,9 @@ static void bio_release_pages(struct bio *bio)
|
||||
{
|
||||
struct bio_vec *bvec;
|
||||
int i;
|
||||
struct bvec_iter_all iter_all;
|
||||
|
||||
bio_for_each_segment_all(bvec, bio, i)
|
||||
bio_for_each_segment_all(bvec, bio, i, iter_all)
|
||||
put_page(bvec->bv_page);
|
||||
}
|
||||
|
||||
@ -1644,8 +1660,9 @@ void bio_check_pages_dirty(struct bio *bio)
|
||||
struct bio_vec *bvec;
|
||||
unsigned long flags;
|
||||
int i;
|
||||
struct bvec_iter_all iter_all;
|
||||
|
||||
bio_for_each_segment_all(bvec, bio, i) {
|
||||
bio_for_each_segment_all(bvec, bio, i, iter_all) {
|
||||
if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
|
||||
goto defer;
|
||||
}
|
||||
|
@@ -1269,7 +1269,7 @@ void blkcg_drain_queue(struct request_queue *q)
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue(). Responsible for exiting blkcg part.
 * Called from blk_exit_queue(). Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
@@ -161,6 +161,73 @@ static inline unsigned get_max_io_size(struct request_queue *q,
	return sectors;
}

static unsigned get_max_segment_size(struct request_queue *q,
				     unsigned offset)
{
	unsigned long mask = queue_segment_boundary(q);

	/* default segment boundary mask means no boundary limit */
	if (mask == BLK_SEG_BOUNDARY_MASK)
		return queue_max_segment_size(q);

	return min_t(unsigned long, mask - (mask & offset) + 1,
		     queue_max_segment_size(q));
}

/*
 * Split the bvec @bv into segments, and update all kinds of
 * variables.
 */
static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
		unsigned *nsegs, unsigned *last_seg_size,
		unsigned *front_seg_size, unsigned *sectors)
{
	unsigned len = bv->bv_len;
	unsigned total_len = 0;
	unsigned new_nsegs = 0, seg_size = 0;

	/*
	 * Multi-page bvec may be too big to hold in one segment, so the
	 * current bvec has to be splitted as multiple segments.
	 */
	while (len && new_nsegs + *nsegs < queue_max_segments(q)) {
		seg_size = get_max_segment_size(q, bv->bv_offset + total_len);
		seg_size = min(seg_size, len);

		new_nsegs++;
		total_len += seg_size;
		len -= seg_size;

		if ((bv->bv_offset + total_len) & queue_virt_boundary(q))
			break;
	}

	if (!new_nsegs)
		return !!len;

	/* update front segment size */
	if (!*nsegs) {
		unsigned first_seg_size;

		if (new_nsegs == 1)
			first_seg_size = get_max_segment_size(q, bv->bv_offset);
		else
			first_seg_size = queue_max_segment_size(q);

		if (*front_seg_size < first_seg_size)
			*front_seg_size = first_seg_size;
	}

	/* update other varibles */
	*last_seg_size = seg_size;
	*nsegs += new_nsegs;
	if (sectors)
		*sectors += total_len >> 9;

	/* split in the middle of the bvec if len != 0 */
	return !!len;
}
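get_max_segment_size() above caps a segment at the queue's segment-boundary mask: starting at a given offset, at most mask - (offset & mask) + 1 bytes fit before the boundary, further clamped to the queue's maximum segment size. A small standalone sketch of that arithmetic with made-up limits (64 KB boundary, 32 KB max segment):

#include <stdio.h>

#define SEG_BOUNDARY_MASK 0xffffUL   /* illustrative 64 KB boundary */
#define MAX_SEGMENT_SIZE  0x8000UL   /* illustrative 32 KB cap */

/* Bytes that still fit into the current segment starting at @offset. */
static unsigned long max_segment_size(unsigned long offset)
{
	unsigned long to_boundary = SEG_BOUNDARY_MASK - (SEG_BOUNDARY_MASK & offset) + 1;

	return to_boundary < MAX_SEGMENT_SIZE ? to_boundary : MAX_SEGMENT_SIZE;
}

int main(void)
{
	/* Far from the boundary: limited by the max segment size (0x8000). */
	printf("offset 0x0000 -> %#lx bytes\n", max_segment_size(0x0000));
	/* Close to the boundary: limited by the distance to it (0x1000). */
	printf("offset 0xf000 -> %#lx bytes\n", max_segment_size(0xf000));
	return 0;
}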
static struct bio *blk_bio_segment_split(struct request_queue *q,
|
||||
struct bio *bio,
|
||||
struct bio_set *bs,
|
||||
@ -174,7 +241,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
|
||||
struct bio *new = NULL;
|
||||
const unsigned max_sectors = get_max_io_size(q, bio);
|
||||
|
||||
bio_for_each_segment(bv, bio, iter) {
|
||||
bio_for_each_bvec(bv, bio, iter) {
|
||||
/*
|
||||
* If the queue doesn't support SG gaps and adding this
|
||||
* offset would create a gap, disallow it.
|
||||
@ -189,8 +256,12 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
|
||||
*/
|
||||
if (nsegs < queue_max_segments(q) &&
|
||||
sectors < max_sectors) {
|
||||
nsegs++;
|
||||
sectors = max_sectors;
|
||||
/* split in the middle of bvec */
|
||||
bv.bv_len = (max_sectors - sectors) << 9;
|
||||
bvec_split_segs(q, &bv, &nsegs,
|
||||
&seg_size,
|
||||
&front_seg_size,
|
||||
§ors);
|
||||
}
|
||||
goto split;
|
||||
}
|
||||
@ -206,21 +277,28 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
|
||||
bvprvp = &bvprv;
|
||||
sectors += bv.bv_len >> 9;
|
||||
|
||||
if (nsegs == 1 && seg_size > front_seg_size)
|
||||
front_seg_size = seg_size;
|
||||
|
||||
continue;
|
||||
}
|
||||
new_segment:
|
||||
if (nsegs == queue_max_segments(q))
|
||||
goto split;
|
||||
|
||||
if (nsegs == 1 && seg_size > front_seg_size)
|
||||
front_seg_size = seg_size;
|
||||
|
||||
nsegs++;
|
||||
bvprv = bv;
|
||||
bvprvp = &bvprv;
|
||||
seg_size = bv.bv_len;
|
||||
sectors += bv.bv_len >> 9;
|
||||
|
||||
if (bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
|
||||
nsegs++;
|
||||
seg_size = bv.bv_len;
|
||||
sectors += bv.bv_len >> 9;
|
||||
if (nsegs == 1 && seg_size > front_seg_size)
|
||||
front_seg_size = seg_size;
|
||||
} else if (bvec_split_segs(q, &bv, &nsegs, &seg_size,
|
||||
&front_seg_size, §ors)) {
|
||||
goto split;
|
||||
}
|
||||
}
|
||||
|
||||
do_split = false;
|
||||
@ -233,8 +311,6 @@ split:
|
||||
bio = new;
|
||||
}
|
||||
|
||||
if (nsegs == 1 && seg_size > front_seg_size)
|
||||
front_seg_size = seg_size;
|
||||
bio->bi_seg_front_size = front_seg_size;
|
||||
if (seg_size > bio->bi_seg_back_size)
|
||||
bio->bi_seg_back_size = seg_size;
|
||||
@ -291,18 +367,20 @@ void blk_queue_split(struct request_queue *q, struct bio **bio)
|
||||
EXPORT_SYMBOL(blk_queue_split);
|
||||
|
||||
static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
|
||||
struct bio *bio,
|
||||
bool no_sg_merge)
|
||||
struct bio *bio)
|
||||
{
|
||||
struct bio_vec bv, bvprv = { NULL };
|
||||
int prev = 0;
|
||||
unsigned int seg_size, nr_phys_segs;
|
||||
unsigned front_seg_size;
|
||||
struct bio *fbio, *bbio;
|
||||
struct bvec_iter iter;
|
||||
|
||||
if (!bio)
|
||||
return 0;
|
||||
|
||||
front_seg_size = bio->bi_seg_front_size;
|
||||
|
||||
switch (bio_op(bio)) {
|
||||
case REQ_OP_DISCARD:
|
||||
case REQ_OP_SECURE_ERASE:
|
||||
@ -316,14 +394,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
|
||||
seg_size = 0;
|
||||
nr_phys_segs = 0;
|
||||
for_each_bio(bio) {
|
||||
bio_for_each_segment(bv, bio, iter) {
|
||||
/*
|
||||
* If SG merging is disabled, each bio vector is
|
||||
* a segment
|
||||
*/
|
||||
if (no_sg_merge)
|
||||
goto new_segment;
|
||||
|
||||
bio_for_each_bvec(bv, bio, iter) {
|
||||
if (prev) {
|
||||
if (seg_size + bv.bv_len
|
||||
> queue_max_segment_size(q))
|
||||
@ -333,23 +404,23 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
|
||||
|
||||
seg_size += bv.bv_len;
|
||||
bvprv = bv;
|
||||
|
||||
if (nr_phys_segs == 1 && seg_size >
|
||||
front_seg_size)
|
||||
front_seg_size = seg_size;
|
||||
|
||||
continue;
|
||||
}
|
||||
new_segment:
|
||||
if (nr_phys_segs == 1 && seg_size >
|
||||
fbio->bi_seg_front_size)
|
||||
fbio->bi_seg_front_size = seg_size;
|
||||
|
||||
nr_phys_segs++;
|
||||
bvprv = bv;
|
||||
prev = 1;
|
||||
seg_size = bv.bv_len;
|
||||
bvec_split_segs(q, &bv, &nr_phys_segs, &seg_size,
|
||||
&front_seg_size, NULL);
|
||||
}
|
||||
bbio = bio;
|
||||
}
|
||||
|
||||
if (nr_phys_segs == 1 && seg_size > fbio->bi_seg_front_size)
|
||||
fbio->bi_seg_front_size = seg_size;
|
||||
fbio->bi_seg_front_size = front_seg_size;
|
||||
if (seg_size > bbio->bi_seg_back_size)
|
||||
bbio->bi_seg_back_size = seg_size;
|
||||
|
||||
@ -358,33 +429,16 @@ new_segment:
|
||||
|
||||
void blk_recalc_rq_segments(struct request *rq)
|
||||
{
|
||||
bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE,
|
||||
&rq->q->queue_flags);
|
||||
|
||||
rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio,
|
||||
no_sg_merge);
|
||||
rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio);
|
||||
}
|
||||
|
||||
void blk_recount_segments(struct request_queue *q, struct bio *bio)
|
||||
{
|
||||
unsigned short seg_cnt;
|
||||
struct bio *nxt = bio->bi_next;
|
||||
|
||||
/* estimate segment number by bi_vcnt for non-cloned bio */
|
||||
if (bio_flagged(bio, BIO_CLONED))
|
||||
seg_cnt = bio_segments(bio);
|
||||
else
|
||||
seg_cnt = bio->bi_vcnt;
|
||||
|
||||
if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) &&
|
||||
(seg_cnt < queue_max_segments(q)))
|
||||
bio->bi_phys_segments = seg_cnt;
|
||||
else {
|
||||
struct bio *nxt = bio->bi_next;
|
||||
|
||||
bio->bi_next = NULL;
|
||||
bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, false);
|
||||
bio->bi_next = nxt;
|
||||
}
|
||||
bio->bi_next = NULL;
|
||||
bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
|
||||
bio->bi_next = nxt;
|
||||
|
||||
bio_set_flag(bio, BIO_SEG_VALID);
|
||||
}
|
||||
@ -407,6 +461,54 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
|
||||
return biovec_phys_mergeable(q, &end_bv, &nxt_bv);
|
||||
}
|
||||
|
||||
static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
|
||||
struct scatterlist *sglist)
|
||||
{
|
||||
if (!*sg)
|
||||
return sglist;
|
||||
|
||||
/*
|
||||
* If the driver previously mapped a shorter list, we could see a
|
||||
* termination bit prematurely unless it fully inits the sg table
|
||||
* on each mapping. We KNOW that there must be more entries here
|
||||
* or the driver would be buggy, so force clear the termination bit
|
||||
* to avoid doing a full sg_init_table() in drivers for each command.
|
||||
*/
|
||||
sg_unmark_end(*sg);
|
||||
return sg_next(*sg);
|
||||
}
|
||||
|
||||
static unsigned blk_bvec_map_sg(struct request_queue *q,
|
||||
struct bio_vec *bvec, struct scatterlist *sglist,
|
||||
struct scatterlist **sg)
|
||||
{
|
||||
unsigned nbytes = bvec->bv_len;
|
||||
unsigned nsegs = 0, total = 0, offset = 0;
|
||||
|
||||
while (nbytes > 0) {
|
||||
unsigned seg_size;
|
||||
struct page *pg;
|
||||
unsigned idx;
|
||||
|
||||
*sg = blk_next_sg(sg, sglist);
|
||||
|
||||
seg_size = get_max_segment_size(q, bvec->bv_offset + total);
|
||||
seg_size = min(nbytes, seg_size);
|
||||
|
||||
offset = (total + bvec->bv_offset) % PAGE_SIZE;
|
||||
idx = (total + bvec->bv_offset) / PAGE_SIZE;
|
||||
pg = bvec_nth_page(bvec->bv_page, idx);
|
||||
|
||||
sg_set_page(*sg, pg, seg_size, offset);
|
||||
|
||||
total += seg_size;
|
||||
nbytes -= seg_size;
|
||||
nsegs++;
|
||||
}
|
||||
|
||||
return nsegs;
|
||||
}
|
||||
|
||||
static inline void
|
||||
__blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
|
||||
struct scatterlist *sglist, struct bio_vec *bvprv,
|
||||
@ -424,25 +526,12 @@ __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
|
||||
(*sg)->length += nbytes;
|
||||
} else {
|
||||
new_segment:
|
||||
if (!*sg)
|
||||
*sg = sglist;
|
||||
else {
|
||||
/*
|
||||
* If the driver previously mapped a shorter
|
||||
* list, we could see a termination bit
|
||||
* prematurely unless it fully inits the sg
|
||||
* table on each mapping. We KNOW that there
|
||||
* must be more entries here or the driver
|
||||
* would be buggy, so force clear the
|
||||
* termination bit to avoid doing a full
|
||||
* sg_init_table() in drivers for each command.
|
||||
*/
|
||||
sg_unmark_end(*sg);
|
||||
*sg = sg_next(*sg);
|
||||
}
|
||||
|
||||
sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
|
||||
(*nsegs)++;
|
||||
if (bvec->bv_offset + bvec->bv_len <= PAGE_SIZE) {
|
||||
*sg = blk_next_sg(sg, sglist);
|
||||
sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
|
||||
(*nsegs) += 1;
|
||||
} else
|
||||
(*nsegs) += blk_bvec_map_sg(q, bvec, sglist, sg);
|
||||
}
|
||||
*bvprv = *bvec;
|
||||
}
|
||||
@ -464,7 +553,7 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
|
||||
int nsegs = 0;
|
||||
|
||||
for_each_bio(bio)
|
||||
bio_for_each_segment(bvec, bio, iter)
|
||||
bio_for_each_bvec(bvec, bio, iter)
|
||||
__blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg,
|
||||
&nsegs);
|
||||
|
||||
|
@ -128,11 +128,9 @@ static const char *const blk_queue_flag_name[] = {
|
||||
QUEUE_FLAG_NAME(SAME_FORCE),
|
||||
QUEUE_FLAG_NAME(DEAD),
|
||||
QUEUE_FLAG_NAME(INIT_DONE),
|
||||
QUEUE_FLAG_NAME(NO_SG_MERGE),
|
||||
QUEUE_FLAG_NAME(POLL),
|
||||
QUEUE_FLAG_NAME(WC),
|
||||
QUEUE_FLAG_NAME(FUA),
|
||||
QUEUE_FLAG_NAME(FLUSH_NQ),
|
||||
QUEUE_FLAG_NAME(DAX),
|
||||
QUEUE_FLAG_NAME(STATS),
|
||||
QUEUE_FLAG_NAME(POLL_STATS),
|
||||
@ -251,7 +249,6 @@ static const char *const alloc_policy_name[] = {
|
||||
static const char *const hctx_flag_name[] = {
|
||||
HCTX_FLAG_NAME(SHOULD_MERGE),
|
||||
HCTX_FLAG_NAME(TAG_SHARED),
|
||||
HCTX_FLAG_NAME(SG_MERGE),
|
||||
HCTX_FLAG_NAME(BLOCKING),
|
||||
HCTX_FLAG_NAME(NO_SCHED),
|
||||
};
|
||||
|
@ -321,7 +321,7 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
|
||||
{
|
||||
struct elevator_queue *e = q->elevator;
|
||||
struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
|
||||
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx->cpu);
|
||||
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
|
||||
bool ret = false;
|
||||
enum hctx_type type;
|
||||
|
||||
|
@ -170,7 +170,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
|
||||
|
||||
data->ctx = blk_mq_get_ctx(data->q);
|
||||
data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
|
||||
data->ctx->cpu);
|
||||
data->ctx);
|
||||
tags = blk_mq_tags_from_data(data);
|
||||
if (data->flags & BLK_MQ_REQ_RESERVED)
|
||||
bt = &tags->breserved_tags;
|
||||
|
@ -364,7 +364,7 @@ static struct request *blk_mq_get_request(struct request_queue *q,
|
||||
}
|
||||
if (likely(!data->hctx))
|
||||
data->hctx = blk_mq_map_queue(q, data->cmd_flags,
|
||||
data->ctx->cpu);
|
||||
data->ctx);
|
||||
if (data->cmd_flags & REQ_NOWAIT)
|
||||
data->flags |= BLK_MQ_REQ_NOWAIT;
|
||||
|
||||
@ -2069,7 +2069,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
|
||||
struct blk_mq_tags *tags;
|
||||
int node;
|
||||
|
||||
node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
|
||||
node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
|
||||
if (node == NUMA_NO_NODE)
|
||||
node = set->numa_node;
|
||||
|
||||
@ -2125,7 +2125,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
|
||||
size_t rq_size, left;
|
||||
int node;
|
||||
|
||||
node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
|
||||
node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
|
||||
if (node == NUMA_NO_NODE)
|
||||
node = set->numa_node;
|
||||
|
||||
@ -2424,7 +2424,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
|
||||
* If the cpu isn't present, the cpu is mapped to first hctx.
|
||||
*/
|
||||
for_each_possible_cpu(i) {
|
||||
hctx_idx = set->map[0].mq_map[i];
|
||||
hctx_idx = set->map[HCTX_TYPE_DEFAULT].mq_map[i];
|
||||
/* unmapped hw queue can be remapped after CPU topo changed */
|
||||
if (!set->tags[hctx_idx] &&
|
||||
!__blk_mq_alloc_rq_map(set, hctx_idx)) {
|
||||
@ -2434,16 +2434,19 @@ static void blk_mq_map_swqueue(struct request_queue *q)
|
||||
* case, remap the current ctx to hctx[0] which
|
||||
* is guaranteed to always have tags allocated
|
||||
*/
|
||||
set->map[0].mq_map[i] = 0;
|
||||
set->map[HCTX_TYPE_DEFAULT].mq_map[i] = 0;
|
||||
}
|
||||
|
||||
ctx = per_cpu_ptr(q->queue_ctx, i);
|
||||
for (j = 0; j < set->nr_maps; j++) {
|
||||
if (!set->map[j].nr_queues)
|
||||
if (!set->map[j].nr_queues) {
|
||||
ctx->hctxs[j] = blk_mq_map_queue_type(q,
|
||||
HCTX_TYPE_DEFAULT, i);
|
||||
continue;
|
||||
}
|
||||
|
||||
hctx = blk_mq_map_queue_type(q, j, i);
|
||||
|
||||
ctx->hctxs[j] = hctx;
|
||||
/*
|
||||
* If the CPU is already set in the mask, then we've
|
||||
* mapped this one already. This can happen if
|
||||
@ -2463,6 +2466,10 @@ static void blk_mq_map_swqueue(struct request_queue *q)
|
||||
*/
|
||||
BUG_ON(!hctx->nr_ctx);
|
||||
}
|
||||
|
||||
for (; j < HCTX_MAX_TYPES; j++)
|
||||
ctx->hctxs[j] = blk_mq_map_queue_type(q,
|
||||
HCTX_TYPE_DEFAULT, i);
|
||||
}
|
||||
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
@ -2734,7 +2741,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
|
||||
int node;
|
||||
struct blk_mq_hw_ctx *hctx;
|
||||
|
||||
node = blk_mq_hw_queue_to_node(&set->map[0], i);
|
||||
node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
|
||||
/*
|
||||
* If the hw queue has been mapped to another numa node,
|
||||
* we need to realloc the hctx. If allocation fails, fallback
|
||||
@ -2838,9 +2845,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
|
||||
set->map[HCTX_TYPE_POLL].nr_queues)
|
||||
blk_queue_flag_set(QUEUE_FLAG_POLL, q);
|
||||
|
||||
if (!(set->flags & BLK_MQ_F_SG_MERGE))
|
||||
blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q);
|
||||
|
||||
q->sg_reserved_size = INT_MAX;
|
||||
|
||||
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
|
||||
@ -2968,7 +2972,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
|
||||
return set->ops->map_queues(set);
|
||||
} else {
|
||||
BUG_ON(set->nr_maps > 1);
|
||||
return blk_mq_map_queues(&set->map[0]);
|
||||
return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
|
||||
}
|
||||
}
|
||||
|
||||
@ -3090,6 +3094,9 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
|
||||
if (!set)
|
||||
return -EINVAL;
|
||||
|
||||
if (q->nr_requests == nr)
|
||||
return 0;
|
||||
|
||||
blk_mq_freeze_queue(q);
|
||||
blk_mq_quiesce_queue(q);
|
||||
|
||||
@ -3235,7 +3242,7 @@ fallback:
|
||||
pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
|
||||
nr_hw_queues, prev_nr_hw_queues);
|
||||
set->nr_hw_queues = prev_nr_hw_queues;
|
||||
blk_mq_map_queues(&set->map[0]);
|
||||
blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
|
||||
goto fallback;
|
||||
}
|
||||
blk_mq_map_swqueue(q);
|
||||
|
@@ -23,6 +23,7 @@ struct blk_mq_ctx {

	unsigned int		cpu;
	unsigned short		index_hw[HCTX_MAX_TYPES];
	struct blk_mq_hw_ctx	*hctxs[HCTX_MAX_TYPES];

	/* incremented at dispatch time */
	unsigned long		rq_dispatched[2];
@@ -96,26 +97,23 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *
 * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue
 * @q: request queue
 * @flags: request command flags
 * @cpu: CPU
 * @cpu: cpu ctx
 */
static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
						     unsigned int flags,
						     unsigned int cpu)
						     struct blk_mq_ctx *ctx)
{
	enum hctx_type type = HCTX_TYPE_DEFAULT;

	if ((flags & REQ_HIPRI) &&
	    q->tag_set->nr_maps > HCTX_TYPE_POLL &&
	    q->tag_set->map[HCTX_TYPE_POLL].nr_queues &&
	    test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
	/*
	 * The caller ensure that if REQ_HIPRI, poll must be enabled.
	 */
	if (flags & REQ_HIPRI)
		type = HCTX_TYPE_POLL;

	else if (((flags & REQ_OP_MASK) == REQ_OP_READ) &&
		 q->tag_set->nr_maps > HCTX_TYPE_READ &&
		 q->tag_set->map[HCTX_TYPE_READ].nr_queues)
	else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
		type = HCTX_TYPE_READ;

	return blk_mq_map_queue_type(q, type, cpu);
	return ctx->hctxs[type];
}
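With the hunk above, blk_mq_map_queue() reduces to deriving an hctx type from the command flags and indexing the per-ctx hctxs[] array that the mapping code now pre-fills, removing the per-request mapping lookup. A userspace sketch of that selection logic; the flag encoding and struct layout are invented for the example:

#include <stdio.h>

enum hctx_type { HCTX_TYPE_DEFAULT, HCTX_TYPE_READ, HCTX_TYPE_POLL, HCTX_MAX_TYPES };

#define REQ_OP_READ  0u            /* illustrative flag encoding */
#define REQ_OP_MASK  0xffu
#define REQ_HIPRI    (1u << 8)

struct hw_ctx { const char *name; };

struct sw_ctx {
	struct hw_ctx *hctxs[HCTX_MAX_TYPES];   /* filled once at map time */
};

/* Mirrors the new blk_mq_map_queue(): flags -> type -> ctx->hctxs[type]. */
static struct hw_ctx *map_queue(unsigned int flags, struct sw_ctx *ctx)
{
	enum hctx_type type = HCTX_TYPE_DEFAULT;

	if (flags & REQ_HIPRI)                       /* caller guarantees poll queues exist */
		type = HCTX_TYPE_POLL;
	else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
		type = HCTX_TYPE_READ;

	return ctx->hctxs[type];
}

int main(void)
{
	struct hw_ctx def = { "default" }, rd = { "read" }, poll = { "poll" };
	struct sw_ctx ctx = { .hctxs = { &def, &rd, &poll } };

	printf("read  -> %s\n", map_queue(REQ_OP_READ, &ctx)->name);
	printf("hipri -> %s\n", map_queue(REQ_HIPRI | 1u, &ctx)->name);
	return 0;
}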
/*
|
||||
|
@ -799,15 +799,6 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
|
||||
}
|
||||
EXPORT_SYMBOL(blk_queue_update_dma_alignment);
|
||||
|
||||
void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
|
||||
{
|
||||
if (queueable)
|
||||
blk_queue_flag_clear(QUEUE_FLAG_FLUSH_NQ, q);
|
||||
else
|
||||
blk_queue_flag_set(QUEUE_FLAG_FLUSH_NQ, q);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
|
||||
|
||||
/**
|
||||
* blk_set_queue_depth - tell the block layer about the device queue depth
|
||||
* @q: the request queue for the device
|
||||
|
@ -468,6 +468,9 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
|
||||
else if (val >= 0)
|
||||
val *= 1000ULL;
|
||||
|
||||
if (wbt_get_min_lat(q) == val)
|
||||
return count;
|
||||
|
||||
/*
|
||||
* Ensure that the queue is idled, in case the latency update
|
||||
* ends up either enabling or disabling wbt completely. We can't
|
||||
@ -817,21 +820,16 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
|
||||
}
|
||||
|
||||
/**
|
||||
* __blk_release_queue - release a request queue when it is no longer needed
|
||||
* __blk_release_queue - release a request queue
|
||||
* @work: pointer to the release_work member of the request queue to be released
|
||||
*
|
||||
* Description:
|
||||
* blk_release_queue is the counterpart of blk_init_queue(). It should be
|
||||
* called when a request queue is being released; typically when a block
|
||||
* device is being de-registered. Its primary task it to free the queue
|
||||
* itself.
|
||||
*
|
||||
* Notes:
|
||||
* The low level driver must have finished any outstanding requests first
|
||||
* via blk_cleanup_queue().
|
||||
*
|
||||
* Although blk_release_queue() may be called with preemption disabled,
|
||||
* __blk_release_queue() may sleep.
|
||||
* This function is called when a block device is being unregistered. The
|
||||
* process of releasing a request queue starts with blk_cleanup_queue, which
|
||||
* set the appropriate flags and then calls blk_put_queue, that decrements
|
||||
* the reference counter of the request queue. Once the reference counter
|
||||
* of the request queue reaches zero, blk_release_queue is called to release
|
||||
* all allocated resources of the request queue.
|
||||
*/
|
||||
static void __blk_release_queue(struct work_struct *work)
|
||||
{
|
||||
|
@ -38,7 +38,7 @@ extern struct ida blk_queue_ida;
|
||||
static inline struct blk_flush_queue *
|
||||
blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
|
||||
{
|
||||
return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx->cpu)->fq;
|
||||
return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
|
||||
}
|
||||
|
||||
static inline void __blk_get_queue(struct request_queue *q)
|
||||
|
@ -165,11 +165,12 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool)
|
||||
struct bio_vec *bvec, orig_vec;
|
||||
int i;
|
||||
struct bvec_iter orig_iter = bio_orig->bi_iter;
|
||||
struct bvec_iter_all iter_all;
|
||||
|
||||
/*
|
||||
* free up bounce indirect pages used
|
||||
*/
|
||||
bio_for_each_segment_all(bvec, bio, i) {
|
||||
bio_for_each_segment_all(bvec, bio, i, iter_all) {
|
||||
orig_vec = bio_iter_iovec(bio_orig, orig_iter);
|
||||
if (bvec->bv_page != orig_vec.bv_page) {
|
||||
dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
|
||||
@ -313,7 +314,12 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
|
||||
bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL :
|
||||
&bounce_bio_set);
|
||||
|
||||
bio_for_each_segment_all(to, bio, i) {
|
||||
/*
|
||||
* Bvec table can't be updated by bio_for_each_segment_all(),
|
||||
* so retrieve bvec from the table directly. This way is safe
|
||||
* because the 'bio' is single-page bvec.
|
||||
*/
|
||||
for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) {
|
||||
struct page *page = to->bv_page;
|
||||
|
||||
if (page_to_pfn(page) <= q->limits.bounce_pfn)
|
||||
|
@ -667,8 +667,11 @@ static int __elevator_change(struct request_queue *q, const char *name)
|
||||
/*
|
||||
* Special case for mq, turn off scheduling
|
||||
*/
|
||||
if (!strncmp(name, "none", 4))
|
||||
if (!strncmp(name, "none", 4)) {
|
||||
if (!q->elevator)
|
||||
return 0;
|
||||
return elevator_switch(q, NULL);
|
||||
}
|
||||
|
||||
strlcpy(elevator_name, name, sizeof(elevator_name));
|
||||
e = elevator_get(q, strstrip(elevator_name), true);
|
||||
|
@ -365,8 +365,8 @@ int register_blkdev(unsigned int major, const char *name)
|
||||
}
|
||||
|
||||
if (index == 0) {
|
||||
printk("register_blkdev: failed to get major for %s\n",
|
||||
name);
|
||||
printk("%s: failed to get major for %s\n",
|
||||
__func__, name);
|
||||
ret = -EBUSY;
|
||||
goto out;
|
||||
}
|
||||
@ -375,8 +375,8 @@ int register_blkdev(unsigned int major, const char *name)
|
||||
}
|
||||
|
||||
if (major >= BLKDEV_MAJOR_MAX) {
|
||||
pr_err("register_blkdev: major requested (%u) is greater than the maximum (%u) for %s\n",
|
||||
major, BLKDEV_MAJOR_MAX-1, name);
|
||||
pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n",
|
||||
__func__, major, BLKDEV_MAJOR_MAX-1, name);
|
||||
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
@ -655,10 +655,12 @@ exit:
|
||||
kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
|
||||
disk_part_iter_exit(&piter);
|
||||
|
||||
err = sysfs_create_link(&ddev->kobj,
|
||||
&disk->queue->backing_dev_info->dev->kobj,
|
||||
"bdi");
|
||||
WARN_ON(err);
|
||||
if (disk->queue->backing_dev_info->dev) {
|
||||
err = sysfs_create_link(&ddev->kobj,
|
||||
&disk->queue->backing_dev_info->dev->kobj,
|
||||
"bdi");
|
||||
WARN_ON(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1318,8 +1318,6 @@ static int ata_scsi_dev_config(struct scsi_device *sdev,
|
||||
scsi_change_queue_depth(sdev, depth);
|
||||
}
|
||||
|
||||
blk_queue_flush_queueable(q, false);
|
||||
|
||||
if (dev->flags & ATA_DFLAG_TRUSTED)
|
||||
sdev->security_supported = 1;
|
||||
|
||||
|
@ -2230,7 +2230,6 @@ static void floppy_end_request(struct request *req, blk_status_t error)
|
||||
static void request_done(int uptodate)
|
||||
{
|
||||
struct request *req = current_req;
|
||||
struct request_queue *q;
|
||||
int block;
|
||||
char msg[sizeof("request done ") + sizeof(int) * 3];
|
||||
|
||||
@ -2243,8 +2242,6 @@ static void request_done(int uptodate)
|
||||
return;
|
||||
}
|
||||
|
||||
q = req->q;
|
||||
|
||||
if (uptodate) {
|
||||
/* maintain values for invalidation on geometry
|
||||
* change */
|
||||
|
@ -511,21 +511,22 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
|
||||
loff_t pos, bool rw)
|
||||
{
|
||||
struct iov_iter iter;
|
||||
struct req_iterator rq_iter;
|
||||
struct bio_vec *bvec;
|
||||
struct request *rq = blk_mq_rq_from_pdu(cmd);
|
||||
struct bio *bio = rq->bio;
|
||||
struct file *file = lo->lo_backing_file;
|
||||
struct bio_vec tmp;
|
||||
unsigned int offset;
|
||||
int segments = 0;
|
||||
int nr_bvec = 0;
|
||||
int ret;
|
||||
|
||||
if (rq->bio != rq->biotail) {
|
||||
struct req_iterator iter;
|
||||
struct bio_vec tmp;
|
||||
rq_for_each_bvec(tmp, rq, rq_iter)
|
||||
nr_bvec++;
|
||||
|
||||
__rq_for_each_bio(bio, rq)
|
||||
segments += bio_segments(bio);
|
||||
bvec = kmalloc_array(segments, sizeof(struct bio_vec),
|
||||
if (rq->bio != rq->biotail) {
|
||||
|
||||
bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
|
||||
GFP_NOIO);
|
||||
if (!bvec)
|
||||
return -EIO;
|
||||
@ -534,10 +535,10 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
|
||||
/*
|
||||
* The bios of the request may be started from the middle of
|
||||
* the 'bvec' because of bio splitting, so we can't directly
|
||||
* copy bio->bi_iov_vec to new bvec. The rq_for_each_segment
|
||||
* copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec
|
||||
* API will take care of all details for us.
|
||||
*/
|
||||
rq_for_each_segment(tmp, rq, iter) {
|
||||
rq_for_each_bvec(tmp, rq, rq_iter) {
|
||||
*bvec = tmp;
|
||||
bvec++;
|
||||
}
|
||||
@ -551,11 +552,10 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
|
||||
*/
|
||||
offset = bio->bi_iter.bi_bvec_done;
|
||||
bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
|
||||
segments = bio_segments(bio);
|
||||
}
|
||||
atomic_set(&cmd->ref, 2);
|
||||
|
||||
iov_iter_bvec(&iter, rw, bvec, segments, blk_rq_bytes(rq));
|
||||
iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq));
|
||||
iter.iov_offset = offset;
|
||||
|
||||
cmd->iocb.ki_pos = pos;
|
||||
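The loop-driver hunks above switch lo_rw_aio() from counting bio segments to counting (possibly multi-page) bvecs with rq_for_each_bvec(), then copying them into a flat array handed to iov_iter_bvec(). A standalone sketch of that count-then-flatten pattern over a toy two-bio request; all types and names here are stand-ins, not the kernel's:

#include <stdio.h>
#include <stdlib.h>

struct tvec { unsigned int len; };

struct tbio {
	struct tvec vecs[3];
	int nvecs;
	struct tbio *next;
};

/* Count vectors across all bios of a request, then copy them out flat. */
static struct tvec *flatten(struct tbio *head, int *out_n)
{
	struct tbio *bio;
	struct tvec *flat;
	int n = 0, i, k = 0;

	for (bio = head; bio; bio = bio->next)          /* counting pass, cf. rq_for_each_bvec */
		n += bio->nvecs;

	flat = malloc(n * sizeof(*flat));
	if (!flat)
		return NULL;

	for (bio = head; bio; bio = bio->next)          /* copy pass that feeds the iterator */
		for (i = 0; i < bio->nvecs; i++)
			flat[k++] = bio->vecs[i];

	*out_n = n;
	return flat;
}

int main(void)
{
	struct tbio b2 = { .vecs = { { 4096 } }, .nvecs = 1, .next = NULL };
	struct tbio b1 = { .vecs = { { 4096 }, { 8192 } }, .nvecs = 2, .next = &b2 };
	int n;
	struct tvec *flat = flatten(&b1, &n);

	printf("flattened %d bvecs, first len %u\n", n, flat ? flat[0].len : 0);
	free(flat);
	return 0;
}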
@ -1089,16 +1089,12 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
|
||||
kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
|
||||
}
|
||||
mapping_set_gfp_mask(filp->f_mapping, gfp);
|
||||
lo->lo_state = Lo_unbound;
|
||||
/* This is safe: open() is still holding a reference. */
|
||||
module_put(THIS_MODULE);
|
||||
blk_mq_unfreeze_queue(lo->lo_queue);
|
||||
|
||||
partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev;
|
||||
lo_number = lo->lo_number;
|
||||
lo->lo_flags = 0;
|
||||
if (!part_shift)
|
||||
lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
|
||||
loop_unprepare_queue(lo);
|
||||
out_unlock:
|
||||
mutex_unlock(&loop_ctl_mutex);
|
||||
@ -1115,11 +1111,29 @@ out_unlock:
|
||||
err = __blkdev_reread_part(bdev);
|
||||
else
|
||||
err = blkdev_reread_part(bdev);
|
||||
pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
|
||||
__func__, lo_number, err);
|
||||
if (err)
|
||||
pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
|
||||
__func__, lo_number, err);
|
||||
/* Device is gone, no point in returning error */
|
||||
err = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* lo->lo_state is set to Lo_unbound here after above partscan has
|
||||
* finished.
|
||||
*
|
||||
* There cannot be anybody else entering __loop_clr_fd() as
|
||||
* lo->lo_backing_file is already cleared and Lo_rundown state
|
||||
* protects us from all the other places trying to change the 'lo'
|
||||
* device.
|
||||
*/
|
||||
mutex_lock(&loop_ctl_mutex);
|
||||
lo->lo_flags = 0;
|
||||
if (!part_shift)
|
||||
lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
|
||||
lo->lo_state = Lo_unbound;
|
||||
mutex_unlock(&loop_ctl_mutex);
|
||||
|
||||
/*
|
||||
* Need not hold loop_ctl_mutex to fput backing file.
|
||||
* Calling fput holding loop_ctl_mutex triggers a circular
|
||||
@ -1937,7 +1951,7 @@ static int loop_add(struct loop_device **l, int i)
|
||||
lo->tag_set.queue_depth = 128;
|
||||
lo->tag_set.numa_node = NUMA_NO_NODE;
|
||||
lo->tag_set.cmd_size = sizeof(struct loop_cmd);
|
||||
lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
|
||||
lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
|
||||
lo->tag_set.driver_data = lo;
|
||||
|
||||
err = blk_mq_alloc_tag_set(&lo->tag_set);
|
||||
|
@ -1416,7 +1416,7 @@ static blk_status_t mtip_send_trim(struct driver_data *dd, unsigned int lba,
|
||||
WARN_ON(sizeof(struct mtip_trim) > ATA_SECT_SIZE);
|
||||
|
||||
/* Allocate a DMA buffer for the trim structure */
|
||||
buf = dmam_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr,
|
||||
buf = dma_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr,
|
||||
GFP_KERNEL);
|
||||
if (!buf)
|
||||
return BLK_STS_RESOURCE;
|
||||
@ -1453,7 +1453,7 @@ static blk_status_t mtip_send_trim(struct driver_data *dd, unsigned int lba,
|
||||
MTIP_TRIM_TIMEOUT_MS) < 0)
|
||||
ret = BLK_STS_IOERR;
|
||||
|
||||
dmam_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr);
|
||||
dma_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -1656,7 +1656,7 @@ static int exec_drive_command(struct mtip_port *port, u8 *command,
|
||||
if (!user_buffer)
|
||||
return -EFAULT;
|
||||
|
||||
buf = dmam_alloc_coherent(&port->dd->pdev->dev,
|
||||
buf = dma_alloc_coherent(&port->dd->pdev->dev,
|
||||
ATA_SECT_SIZE * xfer_sz,
|
||||
&dma_addr,
|
||||
GFP_KERNEL);
|
||||
@ -1734,7 +1734,7 @@ static int exec_drive_command(struct mtip_port *port, u8 *command,
|
||||
}
|
||||
exit_drive_command:
|
||||
if (buf)
|
||||
dmam_free_coherent(&port->dd->pdev->dev,
|
||||
dma_free_coherent(&port->dd->pdev->dev,
|
||||
ATA_SECT_SIZE * xfer_sz, buf, dma_addr);
|
||||
return rv;
|
||||
}
|
||||
@ -2838,11 +2838,11 @@ static void mtip_dma_free(struct driver_data *dd)
|
||||
struct mtip_port *port = dd->port;
|
||||
|
||||
if (port->block1)
|
||||
dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
|
||||
dma_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
|
||||
port->block1, port->block1_dma);
|
||||
|
||||
if (port->command_list) {
|
||||
dmam_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
|
||||
dma_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
|
||||
port->command_list, port->command_list_dma);
|
||||
}
|
||||
}
|
||||
@ -2861,7 +2861,7 @@ static int mtip_dma_alloc(struct driver_data *dd)
|
||||
|
||||
/* Allocate dma memory for RX Fis, Identify, and Sector Bufffer */
|
||||
port->block1 =
|
||||
dmam_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
|
||||
dma_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
|
||||
&port->block1_dma, GFP_KERNEL);
|
||||
if (!port->block1)
|
||||
return -ENOMEM;
|
||||
@ -2869,10 +2869,10 @@ static int mtip_dma_alloc(struct driver_data *dd)
|
||||
|
||||
/* Allocate dma memory for command list */
|
||||
port->command_list =
|
||||
dmam_alloc_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
|
||||
dma_alloc_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
|
||||
&port->command_list_dma, GFP_KERNEL);
|
||||
if (!port->command_list) {
|
||||
dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
|
||||
dma_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
|
||||
port->block1, port->block1_dma);
|
||||
port->block1 = NULL;
|
||||
port->block1_dma = 0;
|
||||
@ -3057,13 +3057,8 @@ static int mtip_hw_init(struct driver_data *dd)
|
||||
mtip_start_port(dd->port);
|
||||
|
||||
/* Setup the ISR and enable interrupts. */
|
||||
rv = devm_request_irq(&dd->pdev->dev,
|
||||
dd->pdev->irq,
|
||||
mtip_irq_handler,
|
||||
IRQF_SHARED,
|
||||
dev_driver_string(&dd->pdev->dev),
|
||||
dd);
|
||||
|
||||
rv = request_irq(dd->pdev->irq, mtip_irq_handler, IRQF_SHARED,
|
||||
dev_driver_string(&dd->pdev->dev), dd);
|
||||
if (rv) {
|
||||
dev_err(&dd->pdev->dev,
|
||||
"Unable to allocate IRQ %d\n", dd->pdev->irq);
|
||||
@ -3091,7 +3086,7 @@ out3:
|
||||
|
||||
/* Release the IRQ. */
|
||||
irq_set_affinity_hint(dd->pdev->irq, NULL);
|
||||
devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
|
||||
free_irq(dd->pdev->irq, dd);
|
||||
|
||||
out2:
|
||||
mtip_deinit_port(dd->port);
|
||||
@ -3146,7 +3141,7 @@ static int mtip_hw_exit(struct driver_data *dd)
|
||||
|
||||
/* Release the IRQ. */
|
||||
irq_set_affinity_hint(dd->pdev->irq, NULL);
|
||||
devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
|
||||
free_irq(dd->pdev->irq, dd);
|
||||
msleep(1000);
|
||||
|
||||
/* Free dma regions */
|
||||
@ -3610,8 +3605,8 @@ static void mtip_free_cmd(struct blk_mq_tag_set *set, struct request *rq,
|
||||
if (!cmd->command)
|
||||
return;
|
||||
|
||||
dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
|
||||
cmd->command, cmd->command_dma);
|
||||
dma_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, cmd->command,
|
||||
cmd->command_dma);
|
||||
}
|
||||
|
||||
static int mtip_init_cmd(struct blk_mq_tag_set *set, struct request *rq,
|
||||
@ -3620,7 +3615,7 @@ static int mtip_init_cmd(struct blk_mq_tag_set *set, struct request *rq,
|
||||
struct driver_data *dd = set->driver_data;
|
||||
struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
|
||||
|
||||
cmd->command = dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
|
||||
cmd->command = dma_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
|
||||
&cmd->command_dma, GFP_KERNEL);
|
||||
if (!cmd->command)
|
||||
return -ENOMEM;
|
||||
|
@ -1571,7 +1571,7 @@ static int nbd_dev_add(int index)
|
||||
nbd->tag_set.numa_node = NUMA_NO_NODE;
|
||||
nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
|
||||
nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
|
||||
BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING;
|
||||
BLK_MQ_F_BLOCKING;
|
||||
nbd->tag_set.driver_data = nbd;
|
||||
|
||||
err = blk_mq_alloc_tag_set(&nbd->tag_set);
|
||||
@ -2118,8 +2118,7 @@ static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
|
||||
}
|
||||
nla_nest_end(reply, dev_list);
|
||||
genlmsg_end(reply, reply_head);
|
||||
genlmsg_reply(reply, info);
|
||||
ret = 0;
|
||||
ret = genlmsg_reply(reply, info);
|
||||
out:
|
||||
mutex_unlock(&nbd_index_mutex);
|
||||
return ret;
|
||||
|
@ -1104,7 +1104,7 @@ static int null_handle_bio(struct nullb_cmd *cmd)
|
||||
len = bvec.bv_len;
|
||||
err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
|
||||
op_is_write(bio_op(bio)), sector,
|
||||
bio_op(bio) & REQ_FUA);
|
||||
bio->bi_opf & REQ_FUA);
|
||||
if (err) {
|
||||
spin_unlock_irq(&nullb->lock);
|
||||
return err;
|
||||
@ -1678,7 +1678,6 @@ static int null_add_dev(struct nullb_device *dev)
|
||||
if (dev->cache_size > 0) {
|
||||
set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
|
||||
blk_queue_write_cache(nullb->q, true, true);
|
||||
blk_queue_flush_queueable(nullb->q, true);
|
||||
}
|
||||
|
||||
if (dev->zoned) {
|
||||
|
@ -3987,7 +3987,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
|
||||
rbd_dev->tag_set.ops = &rbd_mq_ops;
|
||||
rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
|
||||
rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
|
||||
rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
|
||||
rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
|
||||
rbd_dev->tag_set.nr_hw_queues = 1;
|
||||
rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
|
||||
|
||||
|
@ -2843,7 +2843,6 @@ static int skd_cons_disk(struct skd_device *skdev)
|
||||
skdev->sgs_per_request * sizeof(struct scatterlist);
|
||||
skdev->tag_set.numa_node = NUMA_NO_NODE;
|
||||
skdev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
|
||||
BLK_MQ_F_SG_MERGE |
|
||||
BLK_ALLOC_POLICY_TO_MQ_FLAG(BLK_TAG_ALLOC_FIFO);
|
||||
skdev->tag_set.driver_data = skdev;
|
||||
rc = blk_mq_alloc_tag_set(&skdev->tag_set);
|
||||
|
@ -977,7 +977,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
|
||||
} else
|
||||
info->tag_set.queue_depth = BLK_RING_SIZE(info);
|
||||
info->tag_set.numa_node = NUMA_NO_NODE;
|
||||
info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
|
||||
info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
|
||||
info->tag_set.cmd_size = sizeof(struct blkif_req);
|
||||
info->tag_set.driver_data = info;
|
||||
|
||||
|
@@ -265,6 +265,7 @@
/* #define ERRLOGMASK (CD_WARNING|CD_OPEN|CD_COUNT_TRACKS|CD_CLOSE) */
/* #define ERRLOGMASK (CD_WARNING|CD_REG_UNREG|CD_DO_IOCTL|CD_OPEN|CD_CLOSE|CD_COUNT_TRACKS) */

#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/major.h>
@@ -3692,9 +3693,9 @@ static struct ctl_table_header *cdrom_sysctl_header;

static void cdrom_sysctl_register(void)
{
	static int initialized;
	static atomic_t initialized = ATOMIC_INIT(0);

	if (initialized == 1)
	if (!atomic_add_unless(&initialized, 1, 1))
		return;

	cdrom_sysctl_header = register_sysctl_table(cdrom_root_table);
@@ -3705,8 +3706,6 @@ static void cdrom_sysctl_register(void)
	cdrom_sysctl_settings.debug = debug;
	cdrom_sysctl_settings.lock = lockdoor;
	cdrom_sysctl_settings.check = check_media_type;

	initialized = 1;
}

static void cdrom_sysctl_unregister(void)
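The cdrom change above closes a registration race by replacing the plain initialized flag with an atomic counter, so atomic_add_unless(&initialized, 1, 1) lets exactly one caller proceed. The same one-shot idiom can be sketched in portable C11 with a compare-and-swap; this is a model of the idea, not the kernel primitive:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int initialized;

/* Returns nonzero only for the single caller that wins the right to initialize. */
static int claim_init_once(void)
{
	int expected = 0;

	/* cf. atomic_add_unless(&initialized, 1, 1): only the 0 -> 1 transition succeeds */
	return atomic_compare_exchange_strong(&initialized, &expected, 1);
}

static void sysctl_register(void)
{
	if (!claim_init_once())
		return;                 /* somebody else already registered */

	printf("registering sysctl table exactly once\n");
}

int main(void)
{
	sysctl_register();
	sysctl_register();              /* second call is a no-op */
	return 0;
}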
@ -141,7 +141,7 @@ struct nvm_chk_meta *pblk_get_chunk_meta(struct pblk *pblk)
|
||||
|
||||
ret = nvm_get_chunk_meta(dev, ppa, geo->all_chunks, meta);
|
||||
if (ret) {
|
||||
kfree(meta);
|
||||
vfree(meta);
|
||||
return ERR_PTR(-EIO);
|
||||
}
|
||||
|
||||
@ -1065,7 +1065,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line,
|
||||
bitmap_set(line->lun_bitmap, 0, lm->lun_bitmap_len);
|
||||
|
||||
smeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC);
|
||||
memcpy(smeta_buf->header.uuid, pblk->instance_uuid, 16);
|
||||
guid_copy((guid_t *)&smeta_buf->header.uuid, &pblk->instance_uuid);
|
||||
smeta_buf->header.id = cpu_to_le32(line->id);
|
||||
smeta_buf->header.type = cpu_to_le16(line->type);
|
||||
smeta_buf->header.version_major = SMETA_VERSION_MAJOR;
|
||||
@ -1278,6 +1278,7 @@ static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line)
|
||||
spin_unlock(&line->lock);
|
||||
|
||||
kref_init(&line->ref);
|
||||
atomic_set(&line->sec_to_update, 0);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -1874,7 +1875,8 @@ void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line)
|
||||
|
||||
if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC) {
|
||||
emeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC);
|
||||
memcpy(emeta_buf->header.uuid, pblk->instance_uuid, 16);
|
||||
guid_copy((guid_t *)&emeta_buf->header.uuid,
|
||||
&pblk->instance_uuid);
|
||||
emeta_buf->header.id = cpu_to_le32(line->id);
|
||||
emeta_buf->header.type = cpu_to_le16(line->type);
|
||||
emeta_buf->header.version_major = EMETA_VERSION_MAJOR;
|
||||
|
@ -365,16 +365,22 @@ static struct pblk_line *pblk_gc_get_victim_line(struct pblk *pblk,
|
||||
struct list_head *group_list)
|
||||
{
|
||||
struct pblk_line *line, *victim;
|
||||
int line_vsc, victim_vsc;
|
||||
unsigned int line_vsc = ~0x0L, victim_vsc = ~0x0L;
|
||||
|
||||
victim = list_first_entry(group_list, struct pblk_line, list);
|
||||
|
||||
list_for_each_entry(line, group_list, list) {
|
||||
line_vsc = le32_to_cpu(*line->vsc);
|
||||
victim_vsc = le32_to_cpu(*victim->vsc);
|
||||
if (line_vsc < victim_vsc)
|
||||
if (!atomic_read(&line->sec_to_update))
|
||||
line_vsc = le32_to_cpu(*line->vsc);
|
||||
if (line_vsc < victim_vsc) {
|
||||
victim = line;
|
||||
victim_vsc = le32_to_cpu(*victim->vsc);
|
||||
}
|
||||
}
|
||||
|
||||
if (victim_vsc == ~0x0)
|
||||
return NULL;
|
||||
|
||||
return victim;
|
||||
}
|
||||
|
||||
@ -448,13 +454,13 @@ next_gc_group:
|
||||
|
||||
do {
|
||||
spin_lock(&l_mg->gc_lock);
|
||||
if (list_empty(group_list)) {
|
||||
|
||||
line = pblk_gc_get_victim_line(pblk, group_list);
|
||||
if (!line) {
|
||||
spin_unlock(&l_mg->gc_lock);
|
||||
break;
|
||||
}
|
||||
|
||||
line = pblk_gc_get_victim_line(pblk, group_list);
|
||||
|
||||
spin_lock(&line->lock);
|
||||
WARN_ON(line->state != PBLK_LINESTATE_CLOSED);
|
||||
line->state = PBLK_LINESTATE_GC;
|
||||
|
@ -130,7 +130,7 @@ static int pblk_l2p_recover(struct pblk *pblk, bool factory_init)
|
||||
struct pblk_line *line = NULL;
|
||||
|
||||
if (factory_init) {
|
||||
pblk_setup_uuid(pblk);
|
||||
guid_gen(&pblk->instance_uuid);
|
||||
} else {
|
||||
line = pblk_recov_l2p(pblk);
|
||||
if (IS_ERR(line)) {
|
||||
@ -584,14 +584,12 @@ static void pblk_lines_free(struct pblk *pblk)
|
||||
struct pblk_line *line;
|
||||
int i;
|
||||
|
||||
spin_lock(&l_mg->free_lock);
|
||||
for (i = 0; i < l_mg->nr_lines; i++) {
|
||||
line = &pblk->lines[i];
|
||||
|
||||
pblk_line_free(line);
|
||||
pblk_line_meta_free(l_mg, line);
|
||||
}
|
||||
spin_unlock(&l_mg->free_lock);
|
||||
|
||||
pblk_line_mg_free(pblk);
|
||||
|
||||
|
@ -73,6 +73,7 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
|
||||
*/
|
||||
if (i < valid_secs) {
|
||||
kref_get(&line->ref);
|
||||
atomic_inc(&line->sec_to_update);
|
||||
w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i);
|
||||
w_ctx->ppa = ppa_list[i];
|
||||
meta->lba = cpu_to_le64(w_ctx->lba);
|
||||
|
@@ -45,10 +45,23 @@ void pblk_rb_free(struct pblk_rb *rb)
/*
 * pblk_rb_calculate_size -- calculate the size of the write buffer
 */
static unsigned int pblk_rb_calculate_size(unsigned int nr_entries)
static unsigned int pblk_rb_calculate_size(unsigned int nr_entries,
					   unsigned int threshold)
{
	/* Alloc a write buffer that can at least fit 128 entries */
	return (1 << max(get_count_order(nr_entries), 7));
	unsigned int thr_sz = 1 << (get_count_order(threshold + NVM_MAX_VLBA));
	unsigned int max_sz = max(thr_sz, nr_entries);
	unsigned int max_io;

	/* Alloc a write buffer that can (i) fit at least two split bios
	 * (considering max I/O size NVM_MAX_VLBA, and (ii) guarantee that the
	 * threshold will be respected
	 */
	max_io = (1 << max((int)(get_count_order(max_sz)),
				(int)(get_count_order(NVM_MAX_VLBA << 1))));
	if ((threshold + NVM_MAX_VLBA) >= max_io)
		max_io <<= 1;

	return max_io;
}
|
||||
@ -67,12 +80,12 @@ int pblk_rb_init(struct pblk_rb *rb, unsigned int size, unsigned int threshold,
|
||||
unsigned int alloc_order, order, iter;
|
||||
unsigned int nr_entries;
|
||||
|
||||
nr_entries = pblk_rb_calculate_size(size);
|
||||
nr_entries = pblk_rb_calculate_size(size, threshold);
|
||||
entries = vzalloc(array_size(nr_entries, sizeof(struct pblk_rb_entry)));
|
||||
if (!entries)
|
||||
return -ENOMEM;
|
||||
|
||||
power_size = get_count_order(size);
|
||||
power_size = get_count_order(nr_entries);
|
||||
power_seg_sz = get_count_order(seg_size);
|
||||
|
||||
down_write(&pblk_rb_lock);
|
||||
@ -149,7 +162,7 @@ int pblk_rb_init(struct pblk_rb *rb, unsigned int size, unsigned int threshold,
|
||||
* Initialize rate-limiter, which controls access to the write buffer
|
||||
* by user and GC I/O
|
||||
*/
|
||||
pblk_rl_init(&pblk->rl, rb->nr_entries);
|
||||
pblk_rl_init(&pblk->rl, rb->nr_entries, threshold);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -247,6 +260,7 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int to_update)
|
||||
entry->cacheline);
|
||||
|
||||
line = pblk_ppa_to_line(pblk, w_ctx->ppa);
|
||||
atomic_dec(&line->sec_to_update);
|
||||
kref_put(&line->ref, pblk_line_put);
|
||||
clean_wctx(w_ctx);
|
||||
rb->l2p_update = pblk_rb_ptr_wrap(rb, rb->l2p_update, 1);
|
||||
|
@ -302,35 +302,55 @@ static int pblk_pad_distance(struct pblk *pblk, struct pblk_line *line)
|
||||
return (distance > line->left_msecs) ? line->left_msecs : distance;
|
||||
}
|
||||
|
||||
static int pblk_line_wp_is_unbalanced(struct pblk *pblk,
|
||||
struct pblk_line *line)
|
||||
/* Return a chunk belonging to a line by stripe(write order) index */
|
||||
static struct nvm_chk_meta *pblk_get_stripe_chunk(struct pblk *pblk,
|
||||
struct pblk_line *line,
|
||||
int index)
|
||||
{
|
||||
struct nvm_tgt_dev *dev = pblk->dev;
|
||||
struct nvm_geo *geo = &dev->geo;
|
||||
struct pblk_line_meta *lm = &pblk->lm;
|
||||
struct pblk_lun *rlun;
|
||||
struct nvm_chk_meta *chunk;
|
||||
struct ppa_addr ppa;
|
||||
u64 line_wp;
|
||||
int pos, i;
|
||||
int pos;
|
||||
|
||||
rlun = &pblk->luns[0];
|
||||
rlun = &pblk->luns[index];
|
||||
ppa = rlun->bppa;
|
||||
pos = pblk_ppa_to_pos(geo, ppa);
|
||||
chunk = &line->chks[pos];
|
||||
|
||||
line_wp = chunk->wp;
|
||||
return &line->chks[pos];
|
||||
}
|
||||
|
||||
for (i = 1; i < lm->blk_per_line; i++) {
|
||||
rlun = &pblk->luns[i];
|
||||
ppa = rlun->bppa;
|
||||
pos = pblk_ppa_to_pos(geo, ppa);
|
||||
chunk = &line->chks[pos];
|
||||
static int pblk_line_wps_are_unbalanced(struct pblk *pblk,
|
||||
struct pblk_line *line)
|
||||
{
|
||||
struct pblk_line_meta *lm = &pblk->lm;
|
||||
int blk_in_line = lm->blk_per_line;
|
||||
struct nvm_chk_meta *chunk;
|
||||
u64 max_wp, min_wp;
|
||||
int i;
|
||||
|
||||
if (chunk->wp > line_wp)
|
||||
i = find_first_zero_bit(line->blk_bitmap, blk_in_line);
|
||||
|
||||
/* If there is one or zero good chunks in the line,
|
||||
* the write pointers can't be unbalanced.
|
||||
*/
|
||||
if (i >= (blk_in_line - 1))
|
||||
return 0;
|
||||
|
||||
chunk = pblk_get_stripe_chunk(pblk, line, i);
|
||||
max_wp = chunk->wp;
|
||||
if (max_wp > pblk->max_write_pgs)
|
||||
min_wp = max_wp - pblk->max_write_pgs;
|
||||
else
|
||||
min_wp = 0;
|
||||
|
||||
i = find_next_zero_bit(line->blk_bitmap, blk_in_line, i + 1);
|
||||
while (i < blk_in_line) {
|
||||
chunk = pblk_get_stripe_chunk(pblk, line, i);
|
||||
if (chunk->wp > max_wp || chunk->wp < min_wp)
|
||||
return 1;
|
||||
else if (chunk->wp < line_wp)
|
||||
line_wp = chunk->wp;
|
||||
|
||||
i = find_next_zero_bit(line->blk_bitmap, blk_in_line, i + 1);
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -356,7 +376,7 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
|
||||
int ret;
|
||||
u64 left_ppas = pblk_sec_in_open_line(pblk, line) - lm->smeta_sec;
|
||||
|
||||
if (pblk_line_wp_is_unbalanced(pblk, line))
|
||||
if (pblk_line_wps_are_unbalanced(pblk, line))
|
||||
pblk_warn(pblk, "recovering unbalanced line (%d)\n", line->id);
|
||||
|
||||
ppa_list = p.ppa_list;
|
||||
@ -703,11 +723,13 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
|
||||
|
||||
/* The first valid instance uuid is used for initialization */
|
||||
if (!valid_uuid) {
|
||||
memcpy(pblk->instance_uuid, smeta_buf->header.uuid, 16);
|
||||
guid_copy(&pblk->instance_uuid,
|
||||
(guid_t *)&smeta_buf->header.uuid);
|
||||
valid_uuid = 1;
|
||||
}
|
||||
|
||||
if (memcmp(pblk->instance_uuid, smeta_buf->header.uuid, 16)) {
|
||||
if (!guid_equal(&pblk->instance_uuid,
|
||||
(guid_t *)&smeta_buf->header.uuid)) {
|
||||
pblk_debug(pblk, "ignore line %u due to uuid mismatch\n",
|
||||
i);
|
||||
continue;
|
||||
@ -737,7 +759,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
|
||||
}
|
||||
|
||||
if (!found_lines) {
|
||||
pblk_setup_uuid(pblk);
|
||||
guid_gen(&pblk->instance_uuid);
|
||||
|
||||
spin_lock(&l_mg->free_lock);
|
||||
WARN_ON_ONCE(!test_and_clear_bit(meta_line,
|
||||
|
@ -207,7 +207,7 @@ void pblk_rl_free(struct pblk_rl *rl)
|
||||
del_timer(&rl->u_timer);
|
||||
}
|
||||
|
||||
void pblk_rl_init(struct pblk_rl *rl, int budget)
|
||||
void pblk_rl_init(struct pblk_rl *rl, int budget, int threshold)
|
||||
{
|
||||
struct pblk *pblk = container_of(rl, struct pblk, rl);
|
||||
struct nvm_tgt_dev *dev = pblk->dev;
|
||||
@ -217,7 +217,6 @@ void pblk_rl_init(struct pblk_rl *rl, int budget)
|
||||
int sec_meta, blk_meta;
|
||||
unsigned int rb_windows;
|
||||
|
||||
|
||||
/* Consider sectors used for metadata */
|
||||
sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
|
||||
blk_meta = DIV_ROUND_UP(sec_meta, geo->clba);
|
||||
@ -234,7 +233,7 @@ void pblk_rl_init(struct pblk_rl *rl, int budget)
|
||||
/* To start with, all buffer is available to user I/O writers */
|
||||
rl->rb_budget = budget;
|
||||
rl->rb_user_max = budget;
|
||||
rl->rb_max_io = budget >> 1;
|
||||
rl->rb_max_io = threshold ? (budget - threshold) : (budget - 1);
|
||||
rl->rb_gc_max = 0;
|
||||
rl->rb_state = PBLK_RL_HIGH;
|
||||
|
||||
|
@ -139,7 +139,7 @@ TRACE_EVENT(pblk_state,
|
||||
/* This part must be outside protection */
|
||||
|
||||
#undef TRACE_INCLUDE_PATH
|
||||
#define TRACE_INCLUDE_PATH ../../../drivers/lightnvm
|
||||
#define TRACE_INCLUDE_PATH ../../drivers/lightnvm
|
||||
#undef TRACE_INCLUDE_FILE
|
||||
#define TRACE_INCLUDE_FILE pblk-trace
|
||||
#include <trace/define_trace.h>
|
||||
|
@ -177,6 +177,7 @@ static void pblk_prepare_resubmit(struct pblk *pblk, unsigned int sentry,
|
||||
* re-map these entries
|
||||
*/
|
||||
line = pblk_ppa_to_line(pblk, w_ctx->ppa);
|
||||
atomic_dec(&line->sec_to_update);
|
||||
kref_put(&line->ref, pblk_line_put);
|
||||
}
|
||||
spin_unlock(&pblk->trans_lock);
|
||||
|
@ -131,8 +131,8 @@ struct pblk_pr_ctx {
|
||||
unsigned int bio_init_idx;
|
||||
void *ppa_ptr;
|
||||
dma_addr_t dma_ppa_list;
|
||||
__le64 lba_list_mem[NVM_MAX_VLBA];
|
||||
__le64 lba_list_media[NVM_MAX_VLBA];
|
||||
u64 lba_list_mem[NVM_MAX_VLBA];
|
||||
u64 lba_list_media[NVM_MAX_VLBA];
|
||||
};
|
||||
|
||||
/* Pad context */
|
||||
@ -487,6 +487,7 @@ struct pblk_line {
|
||||
__le32 *vsc; /* Valid sector count in line */
|
||||
|
||||
struct kref ref; /* Write buffer L2P references */
|
||||
atomic_t sec_to_update; /* Outstanding L2P updates to ppa */
|
||||
|
||||
struct pblk_w_err_gc *w_err_gc; /* Write error gc recovery metadata */
|
||||
|
||||
@ -646,7 +647,7 @@ struct pblk {
|
||||
|
||||
int sec_per_write;
|
||||
|
||||
unsigned char instance_uuid[16];
|
||||
guid_t instance_uuid;
|
||||
|
||||
/* Persistent write amplification counters, 4kb sector I/Os */
|
||||
atomic64_t user_wa; /* Sectors written by user */
|
||||
@ -924,7 +925,7 @@ int pblk_gc_sysfs_force(struct pblk *pblk, int force);
|
||||
/*
|
||||
* pblk rate limiter
|
||||
*/
|
||||
void pblk_rl_init(struct pblk_rl *rl, int budget);
|
||||
void pblk_rl_init(struct pblk_rl *rl, int budget, int threshold);
|
||||
void pblk_rl_free(struct pblk_rl *rl);
|
||||
void pblk_rl_update_rates(struct pblk_rl *rl);
|
||||
int pblk_rl_high_thrs(struct pblk_rl *rl);
|
||||
@ -1360,14 +1361,6 @@ static inline unsigned int pblk_get_secs(struct bio *bio)
|
||||
return bio->bi_iter.bi_size / PBLK_EXPOSED_PAGE_SIZE;
|
||||
}
|
||||
|
||||
static inline void pblk_setup_uuid(struct pblk *pblk)
|
||||
{
|
||||
uuid_le uuid;
|
||||
|
||||
uuid_le_gen(&uuid);
|
||||
memcpy(pblk->instance_uuid, uuid.b, 16);
|
||||
}
|
||||
|
||||
static inline char *pblk_disk_name(struct pblk *pblk)
|
||||
{
|
||||
struct gendisk *disk = pblk->disk;
|
||||
|
@ -432,8 +432,9 @@ static void do_btree_node_write(struct btree *b)
|
||||
int j;
|
||||
struct bio_vec *bv;
|
||||
void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
|
||||
struct bvec_iter_all iter_all;
|
||||
|
||||
bio_for_each_segment_all(bv, b->bio, j)
|
||||
bio_for_each_segment_all(bv, b->bio, j, iter_all)
|
||||
memcpy(page_address(bv->bv_page),
|
||||
base + j * PAGE_SIZE, PAGE_SIZE);
|
||||
|
||||
|
@@ -538,6 +538,7 @@ static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
{
struct btree *b = container_of(bk, struct btree, keys);
unsigned int i, stale;
char buf[80];

if (!KEY_PTRS(k) ||
bch_extent_invalid(bk, k))

@@ -547,19 +548,19 @@ static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
if (!ptr_available(b->c, k, i))
return true;

if (!expensive_debug_checks(b->c) && KEY_DIRTY(k))
return false;

for (i = 0; i < KEY_PTRS(k); i++) {
stale = ptr_stale(b->c, k, i);

if (stale && KEY_DIRTY(k)) {
bch_extent_to_text(buf, sizeof(buf), k);
pr_info("stale dirty pointer, stale %u, key: %s",
stale, buf);
}

btree_bug_on(stale > BUCKET_GC_GEN_MAX, b,
"key too stale: %i, need_gc %u",
stale, b->c->need_gc);

btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
b, "stale dirty pointer");

if (stale)
return true;
@@ -392,10 +392,11 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)

/*
* Flag for bypass if the IO is for read-ahead or background,
* unless the read-ahead request is for metadata (eg, for gfs2).
* unless the read-ahead request is for metadata
* (eg, for gfs2 or xfs).
*/
if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) &&
!(bio->bi_opf & REQ_PRIO))
!(bio->bi_opf & (REQ_META|REQ_PRIO)))
goto skip;

if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||

@@ -877,7 +878,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
}

if (!(bio->bi_opf & REQ_RAHEAD) &&
!(bio->bi_opf & REQ_PRIO) &&
!(bio->bi_opf & (REQ_META|REQ_PRIO)) &&
s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA)
reada = min_t(sector_t, dc->readahead >> 9,
get_capacity(bio->bi_disk) - bio_end_sector(bio));
@@ -111,7 +111,7 @@ void bch_cache_accounting_clear(struct cache_accounting *acc)
{
memset(&acc->total.cache_hits,
0,
sizeof(unsigned long) * 7);
sizeof(struct cache_stats));
}

void bch_cache_accounting_destroy(struct cache_accounting *acc)
@@ -1615,21 +1615,21 @@ static void conditional_stop_bcache_device(struct cache_set *c,
*/
pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.",
d->disk->disk_name);
/*
* There might be a small time gap that cache set is
* released but bcache device is not. Inside this time
* gap, regular I/O requests will directly go into
* backing device as no cache set attached to. This
* behavior may also introduce potential inconsistence
* data in writeback mode while cache is dirty.
* Therefore before calling bcache_device_stop() due
* to a broken cache device, dc->io_disable should be
* explicitly set to true.
*/
dc->io_disable = true;
/* make others know io_disable is true earlier */
smp_mb();
bcache_device_stop(d);
/*
* There might be a small time gap that cache set is
* released but bcache device is not. Inside this time
* gap, regular I/O requests will directly go into
* backing device as no cache set attached to. This
* behavior may also introduce potential inconsistence
* data in writeback mode while cache is dirty.
* Therefore before calling bcache_device_stop() due
* to a broken cache device, dc->io_disable should be
* explicitly set to true.
*/
dc->io_disable = true;
/* make others know io_disable is true earlier */
smp_mb();
bcache_device_stop(d);
} else {
/*
* dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO
@@ -67,6 +67,8 @@ read_attribute(written);
read_attribute(btree_written);
read_attribute(metadata_written);
read_attribute(active_journal_entries);
read_attribute(backing_dev_name);
read_attribute(backing_dev_uuid);

sysfs_time_stats_attribute(btree_gc, sec, ms);
sysfs_time_stats_attribute(btree_split, sec, us);
@ -243,6 +245,19 @@ SHOW(__bch_cached_dev)
|
||||
return strlen(buf);
|
||||
}
|
||||
|
||||
if (attr == &sysfs_backing_dev_name) {
|
||||
snprintf(buf, BDEVNAME_SIZE + 1, "%s", dc->backing_dev_name);
|
||||
strcat(buf, "\n");
|
||||
return strlen(buf);
|
||||
}
|
||||
|
||||
if (attr == &sysfs_backing_dev_uuid) {
|
||||
/* convert binary uuid into 36-byte string plus '\0' */
|
||||
snprintf(buf, 36+1, "%pU", dc->sb.uuid);
|
||||
strcat(buf, "\n");
|
||||
return strlen(buf);
|
||||
}
|
||||
|
||||
#undef var
|
||||
return 0;
|
||||
}
|
||||
@ -262,10 +277,10 @@ STORE(__cached_dev)
|
||||
|
||||
sysfs_strtoul(data_csum, dc->disk.data_csum);
|
||||
d_strtoul(verify);
|
||||
d_strtoul(bypass_torture_test);
|
||||
d_strtoul(writeback_metadata);
|
||||
d_strtoul(writeback_running);
|
||||
d_strtoul(writeback_delay);
|
||||
sysfs_strtoul_bool(bypass_torture_test, dc->bypass_torture_test);
|
||||
sysfs_strtoul_bool(writeback_metadata, dc->writeback_metadata);
|
||||
sysfs_strtoul_bool(writeback_running, dc->writeback_running);
|
||||
sysfs_strtoul_clamp(writeback_delay, dc->writeback_delay, 0, UINT_MAX);
|
||||
|
||||
sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent,
|
||||
0, bch_cutoff_writeback);
|
||||
@ -287,9 +302,15 @@ STORE(__cached_dev)
|
||||
sysfs_strtoul_clamp(writeback_rate_update_seconds,
|
||||
dc->writeback_rate_update_seconds,
|
||||
1, WRITEBACK_RATE_UPDATE_SECS_MAX);
|
||||
d_strtoul(writeback_rate_i_term_inverse);
|
||||
d_strtoul_nonzero(writeback_rate_p_term_inverse);
|
||||
d_strtoul_nonzero(writeback_rate_minimum);
|
||||
sysfs_strtoul_clamp(writeback_rate_i_term_inverse,
|
||||
dc->writeback_rate_i_term_inverse,
|
||||
1, UINT_MAX);
|
||||
sysfs_strtoul_clamp(writeback_rate_p_term_inverse,
|
||||
dc->writeback_rate_p_term_inverse,
|
||||
1, UINT_MAX);
|
||||
sysfs_strtoul_clamp(writeback_rate_minimum,
|
||||
dc->writeback_rate_minimum,
|
||||
1, UINT_MAX);
|
||||
|
||||
sysfs_strtoul_clamp(io_error_limit, dc->error_limit, 0, INT_MAX);
|
||||
|
||||
@ -299,7 +320,9 @@ STORE(__cached_dev)
|
||||
dc->io_disable = v ? 1 : 0;
|
||||
}
|
||||
|
||||
d_strtoi_h(sequential_cutoff);
|
||||
sysfs_strtoul_clamp(sequential_cutoff,
|
||||
dc->sequential_cutoff,
|
||||
0, UINT_MAX);
|
||||
d_strtoi_h(readahead);
|
||||
|
||||
if (attr == &sysfs_clear_stats)
|
||||
@ -452,6 +475,8 @@ static struct attribute *bch_cached_dev_files[] = {
|
||||
&sysfs_verify,
|
||||
&sysfs_bypass_torture_test,
|
||||
#endif
|
||||
&sysfs_backing_dev_name,
|
||||
&sysfs_backing_dev_uuid,
|
||||
NULL
|
||||
};
|
||||
KTYPE(bch_cached_dev);
|
||||
@ -761,10 +786,12 @@ STORE(__bch_cache_set)
|
||||
c->shrink.scan_objects(&c->shrink, &sc);
|
||||
}
|
||||
|
||||
sysfs_strtoul(congested_read_threshold_us,
|
||||
c->congested_read_threshold_us);
|
||||
sysfs_strtoul(congested_write_threshold_us,
|
||||
c->congested_write_threshold_us);
|
||||
sysfs_strtoul_clamp(congested_read_threshold_us,
|
||||
c->congested_read_threshold_us,
|
||||
0, UINT_MAX);
|
||||
sysfs_strtoul_clamp(congested_write_threshold_us,
|
||||
c->congested_write_threshold_us,
|
||||
0, UINT_MAX);
|
||||
|
||||
if (attr == &sysfs_errors) {
|
||||
v = __sysfs_match_string(error_actions, -1, buf);
|
||||
@ -774,12 +801,20 @@ STORE(__bch_cache_set)
|
||||
c->on_error = v;
|
||||
}
|
||||
|
||||
if (attr == &sysfs_io_error_limit)
|
||||
c->error_limit = strtoul_or_return(buf);
|
||||
sysfs_strtoul_clamp(io_error_limit, c->error_limit, 0, UINT_MAX);
|
||||
|
||||
/* See count_io_errors() for why 88 */
|
||||
if (attr == &sysfs_io_error_halflife)
|
||||
c->error_decay = strtoul_or_return(buf) / 88;
|
||||
if (attr == &sysfs_io_error_halflife) {
|
||||
unsigned long v = 0;
|
||||
ssize_t ret;
|
||||
|
||||
ret = strtoul_safe_clamp(buf, v, 0, UINT_MAX);
|
||||
if (!ret) {
|
||||
c->error_decay = v / 88;
|
||||
return size;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (attr == &sysfs_io_disable) {
|
||||
v = strtoul_or_return(buf);
|
||||
@ -794,13 +829,15 @@ STORE(__bch_cache_set)
|
||||
}
|
||||
}
|
||||
|
||||
sysfs_strtoul(journal_delay_ms, c->journal_delay_ms);
|
||||
sysfs_strtoul(verify, c->verify);
|
||||
sysfs_strtoul(key_merging_disabled, c->key_merging_disabled);
|
||||
sysfs_strtoul_clamp(journal_delay_ms,
|
||||
c->journal_delay_ms,
|
||||
0, USHRT_MAX);
|
||||
sysfs_strtoul_bool(verify, c->verify);
|
||||
sysfs_strtoul_bool(key_merging_disabled, c->key_merging_disabled);
|
||||
sysfs_strtoul(expensive_debug_checks, c->expensive_debug_checks);
|
||||
sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite);
|
||||
sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled);
|
||||
sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled);
|
||||
sysfs_strtoul_bool(gc_always_rewrite, c->gc_always_rewrite);
|
||||
sysfs_strtoul_bool(btree_shrinker_disabled, c->shrinker_disabled);
|
||||
sysfs_strtoul_bool(copy_gc_enabled, c->copy_gc_enabled);
|
||||
/*
|
||||
* write gc_after_writeback here may overwrite an already set
|
||||
* BCH_DO_AUTO_GC, it doesn't matter because this flag will be
|
||||
|
@ -79,11 +79,28 @@ do { \
|
||||
return strtoul_safe(buf, var) ?: (ssize_t) size; \
|
||||
} while (0)
|
||||
|
||||
#define sysfs_strtoul_bool(file, var) \
|
||||
do { \
|
||||
if (attr == &sysfs_ ## file) { \
|
||||
unsigned long v = strtoul_or_return(buf); \
|
||||
\
|
||||
var = v ? 1 : 0; \
|
||||
return size; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define sysfs_strtoul_clamp(file, var, min, max) \
|
||||
do { \
|
||||
if (attr == &sysfs_ ## file) \
|
||||
return strtoul_safe_clamp(buf, var, min, max) \
|
||||
?: (ssize_t) size; \
|
||||
if (attr == &sysfs_ ## file) { \
|
||||
unsigned long v = 0; \
|
||||
ssize_t ret; \
|
||||
ret = strtoul_safe_clamp(buf, v, min, max); \
|
||||
if (!ret) { \
|
||||
var = v; \
|
||||
return size; \
|
||||
} \
|
||||
return ret; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define strtoul_or_return(cp) \
|
||||
|
@ -270,7 +270,11 @@ int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
|
||||
int i;
|
||||
struct bio_vec *bv;
|
||||
|
||||
bio_for_each_segment_all(bv, bio, i) {
|
||||
/*
|
||||
* This is called on freshly new bio, so it is safe to access the
|
||||
* bvec table directly.
|
||||
*/
|
||||
for (i = 0, bv = bio->bi_io_vec; i < bio->bi_vcnt; bv++, i++) {
|
||||
bv->bv_page = alloc_page(gfp_mask);
|
||||
if (!bv->bv_page) {
|
||||
while (--bv >= bio->bi_io_vec)
|
||||
|
@ -71,6 +71,9 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
|
||||
in_use > bch_cutoff_writeback_sync)
|
||||
return false;
|
||||
|
||||
if (bio_op(bio) == REQ_OP_DISCARD)
|
||||
return false;
|
||||
|
||||
if (dc->partial_stripes_expensive &&
|
||||
bcache_dev_stripe_dirty(dc, bio->bi_iter.bi_sector,
|
||||
bio_sectors(bio)))
|
||||
|
@ -1447,8 +1447,9 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
|
||||
{
|
||||
unsigned int i;
|
||||
struct bio_vec *bv;
|
||||
struct bvec_iter_all iter_all;
|
||||
|
||||
bio_for_each_segment_all(bv, clone, i) {
|
||||
bio_for_each_segment_all(bv, clone, i, iter_all) {
|
||||
BUG_ON(!bv->bv_page);
|
||||
mempool_free(bv->bv_page, &cc->page_pool);
|
||||
}
|
||||
|
@ -527,7 +527,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
|
||||
md->tag_set->ops = &dm_mq_ops;
|
||||
md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
|
||||
md->tag_set->numa_node = md->numa_node_id;
|
||||
md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
|
||||
md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
|
||||
md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
|
||||
md->tag_set->driver_data = md;
|
||||
|
||||
|
@ -1698,14 +1698,6 @@ static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
|
||||
return q && !blk_queue_add_random(q);
|
||||
}
|
||||
|
||||
static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev,
|
||||
sector_t start, sector_t len, void *data)
|
||||
{
|
||||
struct request_queue *q = bdev_get_queue(dev->bdev);
|
||||
|
||||
return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
|
||||
}
|
||||
|
||||
static bool dm_table_all_devices_attribute(struct dm_table *t,
|
||||
iterate_devices_callout_fn func)
|
||||
{
|
||||
@ -1902,11 +1894,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
|
||||
if (!dm_table_supports_write_zeroes(t))
|
||||
q->limits.max_write_zeroes_sectors = 0;
|
||||
|
||||
if (dm_table_all_devices_attribute(t, queue_supports_sg_merge))
|
||||
blk_queue_flag_clear(QUEUE_FLAG_NO_SG_MERGE, q);
|
||||
else
|
||||
blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q);
|
||||
|
||||
dm_table_verify_integrity(t);
|
||||
|
||||
/*
|
||||
|
@@ -96,8 +96,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
int i, cnt;
bool discard_supported = false;

conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info),
GFP_KERNEL);
conf = kzalloc(struct_size(conf, disks, raid_disks), GFP_KERNEL);
if (!conf)
return NULL;
@ -1603,11 +1603,9 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
|
||||
return;
|
||||
}
|
||||
set_bit(Blocked, &rdev->flags);
|
||||
if (test_and_clear_bit(In_sync, &rdev->flags)) {
|
||||
if (test_and_clear_bit(In_sync, &rdev->flags))
|
||||
mddev->degraded++;
|
||||
set_bit(Faulty, &rdev->flags);
|
||||
} else
|
||||
set_bit(Faulty, &rdev->flags);
|
||||
set_bit(Faulty, &rdev->flags);
|
||||
spin_unlock_irqrestore(&conf->device_lock, flags);
|
||||
/*
|
||||
* if recovery is running, make sure it aborts.
|
||||
@ -2120,13 +2118,14 @@ static void process_checks(struct r1bio *r1_bio)
|
||||
struct page **spages = get_resync_pages(sbio)->pages;
|
||||
struct bio_vec *bi;
|
||||
int page_len[RESYNC_PAGES] = { 0 };
|
||||
struct bvec_iter_all iter_all;
|
||||
|
||||
if (sbio->bi_end_io != end_sync_read)
|
||||
continue;
|
||||
/* Now we can 'fixup' the error value */
|
||||
sbio->bi_status = 0;
|
||||
|
||||
bio_for_each_segment_all(bi, sbio, j)
|
||||
bio_for_each_segment_all(bi, sbio, j, iter_all)
|
||||
page_len[j] = bi->bv_len;
|
||||
|
||||
if (!status) {
|
||||
|
@ -417,8 +417,7 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card)
|
||||
else
|
||||
mq->tag_set.queue_depth = MMC_QUEUE_DEPTH;
|
||||
mq->tag_set.numa_node = NUMA_NO_NODE;
|
||||
mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE |
|
||||
BLK_MQ_F_BLOCKING;
|
||||
mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
|
||||
mq->tag_set.nr_hw_queues = 1;
|
||||
mq->tag_set.cmd_size = sizeof(struct mmc_queue_req);
|
||||
mq->tag_set.driver_data = mq;
|
||||
|
@ -1,15 +1,7 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* NVM Express device driver
|
||||
* Copyright (c) 2011-2014, Intel Corporation.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms and conditions of the GNU General Public License,
|
||||
* version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#include <linux/blkdev.h>
|
||||
@ -151,11 +143,8 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync);
|
||||
|
||||
static void nvme_delete_ctrl_work(struct work_struct *work)
|
||||
static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
|
||||
{
|
||||
struct nvme_ctrl *ctrl =
|
||||
container_of(work, struct nvme_ctrl, delete_work);
|
||||
|
||||
dev_info(ctrl->device,
|
||||
"Removing ctrl: NQN \"%s\"\n", ctrl->opts->subsysnqn);
|
||||
|
||||
@ -167,6 +156,14 @@ static void nvme_delete_ctrl_work(struct work_struct *work)
|
||||
nvme_put_ctrl(ctrl);
|
||||
}
|
||||
|
||||
static void nvme_delete_ctrl_work(struct work_struct *work)
|
||||
{
|
||||
struct nvme_ctrl *ctrl =
|
||||
container_of(work, struct nvme_ctrl, delete_work);
|
||||
|
||||
nvme_do_delete_ctrl(ctrl);
|
||||
}
|
||||
|
||||
int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
|
||||
{
|
||||
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
|
||||
@ -177,7 +174,7 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
|
||||
|
||||
int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
|
||||
static int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
@ -186,13 +183,13 @@ int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
|
||||
* can free the controller.
|
||||
*/
|
||||
nvme_get_ctrl(ctrl);
|
||||
ret = nvme_delete_ctrl(ctrl);
|
||||
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
|
||||
ret = -EBUSY;
|
||||
if (!ret)
|
||||
flush_work(&ctrl->delete_work);
|
||||
nvme_do_delete_ctrl(ctrl);
|
||||
nvme_put_ctrl(ctrl);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nvme_delete_ctrl_sync);
|
||||
|
||||
static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
|
||||
{
|
||||
@ -611,6 +608,22 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
|
||||
return BLK_STS_OK;
|
||||
}
|
||||
|
||||
static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
|
||||
struct request *req, struct nvme_command *cmnd)
|
||||
{
|
||||
if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
|
||||
return nvme_setup_discard(ns, req, cmnd);
|
||||
|
||||
cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
|
||||
cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
|
||||
cmnd->write_zeroes.slba =
|
||||
cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
|
||||
cmnd->write_zeroes.length =
|
||||
cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
|
||||
cmnd->write_zeroes.control = 0;
|
||||
return BLK_STS_OK;
|
||||
}
|
||||
|
||||
static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
|
||||
struct request *req, struct nvme_command *cmnd)
|
||||
{
|
||||
@ -705,7 +718,8 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
|
||||
nvme_setup_flush(ns, cmd);
|
||||
break;
|
||||
case REQ_OP_WRITE_ZEROES:
|
||||
/* currently only aliased to deallocate for a few ctrls: */
|
||||
ret = nvme_setup_write_zeroes(ns, req, cmd);
|
||||
break;
|
||||
case REQ_OP_DISCARD:
|
||||
ret = nvme_setup_discard(ns, req, cmd);
|
||||
break;
|
||||
@ -1512,6 +1526,37 @@ static void nvme_config_discard(struct nvme_ns *ns)
|
||||
blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
|
||||
}
|
||||
|
||||
static inline void nvme_config_write_zeroes(struct nvme_ns *ns)
|
||||
{
|
||||
u32 max_sectors;
|
||||
unsigned short bs = 1 << ns->lba_shift;
|
||||
|
||||
if (!(ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES))
|
||||
return;
|
||||
/*
|
||||
* Even though NVMe spec explicitly states that MDTS is not
|
||||
* applicable to the write-zeroes:- "The restriction does not apply to
|
||||
* commands that do not transfer data between the host and the
|
||||
* controller (e.g., Write Uncorrectable ro Write Zeroes command).".
|
||||
* In order to be more cautious use controller's max_hw_sectors value
|
||||
* to configure the maximum sectors for the write-zeroes which is
|
||||
* configured based on the controller's MDTS field in the
|
||||
* nvme_init_identify() if available.
|
||||
*/
|
||||
if (ns->ctrl->max_hw_sectors == UINT_MAX)
|
||||
max_sectors = ((u32)(USHRT_MAX + 1) * bs) >> 9;
|
||||
else
|
||||
max_sectors = ((u32)(ns->ctrl->max_hw_sectors + 1) * bs) >> 9;
|
||||
|
||||
blk_queue_max_write_zeroes_sectors(ns->queue, max_sectors);
|
||||
}
|
||||
|
||||
static inline void nvme_ns_config_oncs(struct nvme_ns *ns)
|
||||
{
|
||||
nvme_config_discard(ns);
|
||||
nvme_config_write_zeroes(ns);
|
||||
}
|
||||
|
||||
static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
|
||||
struct nvme_id_ns *id, struct nvme_ns_ids *ids)
|
||||
{
|
||||
@ -1565,7 +1610,7 @@ static void nvme_update_disk_info(struct gendisk *disk,
|
||||
capacity = 0;
|
||||
|
||||
set_capacity(disk, capacity);
|
||||
nvme_config_discard(ns);
|
||||
nvme_ns_config_oncs(ns);
|
||||
|
||||
if (id->nsattr & (1 << 0))
|
||||
set_disk_ro(disk, true);
|
||||
@ -2280,6 +2325,9 @@ static struct attribute *nvme_subsys_attrs[] = {
|
||||
&subsys_attr_serial.attr,
|
||||
&subsys_attr_firmware_rev.attr,
|
||||
&subsys_attr_subsysnqn.attr,
|
||||
#ifdef CONFIG_NVME_MULTIPATH
|
||||
&subsys_attr_iopolicy.attr,
|
||||
#endif
|
||||
NULL,
|
||||
};
|
||||
|
||||
@ -2332,6 +2380,9 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
|
||||
memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
|
||||
subsys->vendor_id = le16_to_cpu(id->vid);
|
||||
subsys->cmic = id->cmic;
|
||||
#ifdef CONFIG_NVME_MULTIPATH
|
||||
subsys->iopolicy = NVME_IOPOLICY_NUMA;
|
||||
#endif
|
||||
|
||||
subsys->dev.class = nvme_subsys_class;
|
||||
subsys->dev.release = nvme_release_subsystem;
|
||||
@ -3163,21 +3214,23 @@ static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
|
||||
static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
|
||||
{
|
||||
struct nvme_ns *ns;
|
||||
struct gendisk *disk;
|
||||
struct nvme_id_ns *id;
|
||||
char disk_name[DISK_NAME_LEN];
|
||||
int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT;
|
||||
int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT, ret;
|
||||
|
||||
ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
|
||||
if (!ns)
|
||||
return;
|
||||
return -ENOMEM;
|
||||
|
||||
ns->queue = blk_mq_init_queue(ctrl->tagset);
|
||||
if (IS_ERR(ns->queue))
|
||||
if (IS_ERR(ns->queue)) {
|
||||
ret = PTR_ERR(ns->queue);
|
||||
goto out_free_ns;
|
||||
}
|
||||
|
||||
blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
|
||||
if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
|
||||
@ -3193,20 +3246,27 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
|
||||
nvme_set_queue_limits(ctrl, ns->queue);
|
||||
|
||||
id = nvme_identify_ns(ctrl, nsid);
|
||||
if (!id)
|
||||
if (!id) {
|
||||
ret = -EIO;
|
||||
goto out_free_queue;
|
||||
}
|
||||
|
||||
if (id->ncap == 0)
|
||||
if (id->ncap == 0) {
|
||||
ret = -EINVAL;
|
||||
goto out_free_id;
|
||||
}
|
||||
|
||||
if (nvme_init_ns_head(ns, nsid, id))
|
||||
ret = nvme_init_ns_head(ns, nsid, id);
|
||||
if (ret)
|
||||
goto out_free_id;
|
||||
nvme_setup_streams_ns(ctrl, ns);
|
||||
nvme_set_disk_name(disk_name, ns, ctrl, &flags);
|
||||
|
||||
disk = alloc_disk_node(0, node);
|
||||
if (!disk)
|
||||
if (!disk) {
|
||||
ret = -ENOMEM;
|
||||
goto out_unlink_ns;
|
||||
}
|
||||
|
||||
disk->fops = &nvme_fops;
|
||||
disk->private_data = ns;
|
||||
@ -3218,7 +3278,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
|
||||
__nvme_revalidate_disk(disk, id);
|
||||
|
||||
if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
|
||||
if (nvme_nvm_register(ns, disk_name, node)) {
|
||||
ret = nvme_nvm_register(ns, disk_name, node);
|
||||
if (ret) {
|
||||
dev_warn(ctrl->device, "LightNVM init failure\n");
|
||||
goto out_put_disk;
|
||||
}
|
||||
@ -3236,7 +3297,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
|
||||
nvme_fault_inject_init(ns);
|
||||
kfree(id);
|
||||
|
||||
return;
|
||||
return 0;
|
||||
out_put_disk:
|
||||
put_disk(ns->disk);
|
||||
out_unlink_ns:
|
||||
@ -3249,6 +3310,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
|
||||
blk_cleanup_queue(ns->queue);
|
||||
out_free_ns:
|
||||
kfree(ns);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void nvme_ns_remove(struct nvme_ns *ns)
|
||||
@ -3596,8 +3658,6 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
|
||||
nvme_stop_keep_alive(ctrl);
|
||||
flush_work(&ctrl->async_event_work);
|
||||
cancel_work_sync(&ctrl->fw_act_work);
|
||||
if (ctrl->ops->stop_ctrl)
|
||||
ctrl->ops->stop_ctrl(ctrl);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
|
||||
|
||||
|
@ -1,15 +1,7 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* NVMe over Fabrics common host code.
|
||||
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms and conditions of the GNU General Public License,
|
||||
* version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
#include <linux/init.h>
|
||||
@ -430,6 +422,7 @@ EXPORT_SYMBOL_GPL(nvmf_connect_admin_queue);
|
||||
* @qid: NVMe I/O queue number for the new I/O connection between
|
||||
* host and target (note qid == 0 is illegal as this is
|
||||
* the Admin queue, per NVMe standard).
|
||||
* @poll: Whether or not to poll for the completion of the connect cmd.
|
||||
*
|
||||
* This function issues a fabrics-protocol connection
|
||||
* of a NVMe I/O queue (via NVMe Fabrics "Connect" command)
|
||||
|
@ -1,15 +1,7 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* NVMe over Fabrics common host code.
|
||||
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms and conditions of the GNU General Public License,
|
||||
* version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
#ifndef _NVME_FABRICS_H
|
||||
#define _NVME_FABRICS_H 1
|
||||
|
@ -1,8 +1,8 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* fault injection support for nvme.
|
||||
*
|
||||
* Copyright (c) 2018, Oracle and/or its affiliates
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/moduleparam.h>
|
||||
|
@ -1,18 +1,6 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Copyright (c) 2016 Avago Technologies. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of version 2 of the GNU General Public License as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful.
|
||||
* ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES,
|
||||
* INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A
|
||||
* PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO
|
||||
* THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID.
|
||||
* See the GNU General Public License for more details, a copy of which
|
||||
* can be found in the file COPYING included with this package
|
||||
*
|
||||
*/
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
#include <linux/module.h>
|
||||
|
@ -1,23 +1,9 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* nvme-lightnvm.c - LightNVM NVMe device
|
||||
*
|
||||
* Copyright (C) 2014-2015 IT University of Copenhagen
|
||||
* Initial release: Matias Bjorling <mb@lightnvm.io>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License version
|
||||
* 2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
|
||||
* USA.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "nvme.h"
|
||||
|
@ -1,14 +1,6 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Copyright (c) 2017-2018 Christoph Hellwig.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms and conditions of the GNU General Public License,
|
||||
* version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#include <linux/moduleparam.h>
|
||||
@ -141,7 +133,10 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
|
||||
test_bit(NVME_NS_ANA_PENDING, &ns->flags))
|
||||
continue;
|
||||
|
||||
distance = node_distance(node, ns->ctrl->numa_node);
|
||||
if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
|
||||
distance = node_distance(node, ns->ctrl->numa_node);
|
||||
else
|
||||
distance = LOCAL_DISTANCE;
|
||||
|
||||
switch (ns->ana_state) {
|
||||
case NVME_ANA_OPTIMIZED:
|
||||
@ -168,6 +163,47 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
|
||||
return found;
|
||||
}
|
||||
|
||||
static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
|
||||
struct nvme_ns *ns)
|
||||
{
|
||||
ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
|
||||
siblings);
|
||||
if (ns)
|
||||
return ns;
|
||||
return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
|
||||
}
|
||||
|
||||
static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
|
||||
int node, struct nvme_ns *old)
|
||||
{
|
||||
struct nvme_ns *ns, *found, *fallback = NULL;
|
||||
|
||||
if (list_is_singular(&head->list))
|
||||
return old;
|
||||
|
||||
for (ns = nvme_next_ns(head, old);
|
||||
ns != old;
|
||||
ns = nvme_next_ns(head, ns)) {
|
||||
if (ns->ctrl->state != NVME_CTRL_LIVE ||
|
||||
test_bit(NVME_NS_ANA_PENDING, &ns->flags))
|
||||
continue;
|
||||
|
||||
if (ns->ana_state == NVME_ANA_OPTIMIZED) {
|
||||
found = ns;
|
||||
goto out;
|
||||
}
|
||||
if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
|
||||
fallback = ns;
|
||||
}
|
||||
|
||||
if (!fallback)
|
||||
return NULL;
|
||||
found = fallback;
|
||||
out:
|
||||
rcu_assign_pointer(head->current_path[node], found);
|
||||
return found;
|
||||
}
|
||||
|
||||
static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
|
||||
{
|
||||
return ns->ctrl->state == NVME_CTRL_LIVE &&
|
||||
@ -180,6 +216,8 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
|
||||
struct nvme_ns *ns;
|
||||
|
||||
ns = srcu_dereference(head->current_path[node], &head->srcu);
|
||||
if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR && ns)
|
||||
ns = nvme_round_robin_path(head, node, ns);
|
||||
if (unlikely(!ns || !nvme_path_is_optimized(ns)))
|
||||
ns = __nvme_find_path(head, node);
|
||||
return ns;
|
||||
@ -471,6 +509,44 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl)
|
||||
cancel_work_sync(&ctrl->ana_work);
|
||||
}
|
||||
|
||||
#define SUBSYS_ATTR_RW(_name, _mode, _show, _store) \
|
||||
struct device_attribute subsys_attr_##_name = \
|
||||
__ATTR(_name, _mode, _show, _store)
|
||||
|
||||
static const char *nvme_iopolicy_names[] = {
|
||||
[NVME_IOPOLICY_NUMA] = "numa",
|
||||
[NVME_IOPOLICY_RR] = "round-robin",
|
||||
};
|
||||
|
||||
static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
struct nvme_subsystem *subsys =
|
||||
container_of(dev, struct nvme_subsystem, dev);
|
||||
|
||||
return sprintf(buf, "%s\n",
|
||||
nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
|
||||
}
|
||||
|
||||
static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
|
||||
struct device_attribute *attr, const char *buf, size_t count)
|
||||
{
|
||||
struct nvme_subsystem *subsys =
|
||||
container_of(dev, struct nvme_subsystem, dev);
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
|
||||
if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
|
||||
WRITE_ONCE(subsys->iopolicy, i);
|
||||
return count;
|
||||
}
|
||||
}
|
||||
|
||||
return -EINVAL;
|
||||
}
|
||||
SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
|
||||
nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
|
||||
|
||||
static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
|
@ -1,14 +1,6 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* Copyright (c) 2011-2014, Intel Corporation.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms and conditions of the GNU General Public License,
|
||||
* version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#ifndef _NVME_H
|
||||
@ -252,6 +244,11 @@ struct nvme_ctrl {
|
||||
unsigned long discard_page_busy;
|
||||
};
|
||||
|
||||
enum nvme_iopolicy {
|
||||
NVME_IOPOLICY_NUMA,
|
||||
NVME_IOPOLICY_RR,
|
||||
};
|
||||
|
||||
struct nvme_subsystem {
|
||||
int instance;
|
||||
struct device dev;
|
||||
@ -271,6 +268,9 @@ struct nvme_subsystem {
|
||||
u8 cmic;
|
||||
u16 vendor_id;
|
||||
struct ida ns_ida;
|
||||
#ifdef CONFIG_NVME_MULTIPATH
|
||||
enum nvme_iopolicy iopolicy;
|
||||
#endif
|
||||
};
|
||||
|
||||
/*
|
||||
@ -364,7 +364,6 @@ struct nvme_ctrl_ops {
|
||||
void (*submit_async_event)(struct nvme_ctrl *ctrl);
|
||||
void (*delete_ctrl)(struct nvme_ctrl *ctrl);
|
||||
int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
|
||||
void (*stop_ctrl)(struct nvme_ctrl *ctrl);
|
||||
};
|
||||
|
||||
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
|
||||
@ -459,7 +458,6 @@ void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
|
||||
int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
|
||||
int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
|
||||
int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
|
||||
int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl);
|
||||
|
||||
int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
|
||||
void *log, size_t size, u64 offset);
|
||||
@ -492,6 +490,7 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
|
||||
|
||||
extern struct device_attribute dev_attr_ana_grpid;
|
||||
extern struct device_attribute dev_attr_ana_state;
|
||||
extern struct device_attribute subsys_attr_iopolicy;
|
||||
|
||||
#else
|
||||
static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
|
||||
|
@ -1,15 +1,7 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* NVM Express device driver
|
||||
* Copyright (c) 2011-2014, Intel Corporation.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms and conditions of the GNU General Public License,
|
||||
* version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#include <linux/aer.h>
|
||||
@ -157,6 +149,8 @@ static int queue_count_set(const char *val, const struct kernel_param *kp)
|
||||
int n = 0, ret;
|
||||
|
||||
ret = kstrtoint(val, 10, &n);
|
||||
if (ret)
|
||||
return ret;
|
||||
if (n > num_possible_cpus())
|
||||
n = num_possible_cpus();
|
||||
|
||||
|
@ -1,15 +1,7 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* NVMe over Fabrics RDMA host code.
|
||||
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms and conditions of the GNU General Public License,
|
||||
* version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
#include <linux/module.h>
|
||||
@ -942,14 +934,6 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
|
||||
}
|
||||
}
|
||||
|
||||
static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
|
||||
{
|
||||
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
|
||||
|
||||
cancel_work_sync(&ctrl->err_work);
|
||||
cancel_delayed_work_sync(&ctrl->reconnect_work);
|
||||
}
|
||||
|
||||
static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
|
||||
{
|
||||
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
|
||||
@ -1158,7 +1142,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
|
||||
struct nvme_rdma_device *dev = queue->device;
|
||||
struct ib_device *ibdev = dev->dev;
|
||||
|
||||
if (!blk_rq_payload_bytes(rq))
|
||||
if (!blk_rq_nr_phys_segments(rq))
|
||||
return;
|
||||
|
||||
if (req->mr) {
|
||||
@ -1281,7 +1265,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
|
||||
|
||||
c->common.flags |= NVME_CMD_SGL_METABUF;
|
||||
|
||||
if (!blk_rq_payload_bytes(rq))
|
||||
if (!blk_rq_nr_phys_segments(rq))
|
||||
return nvme_rdma_set_sg_null(c);
|
||||
|
||||
req->sg_table.sgl = req->first_sgl;
|
||||
@ -1854,6 +1838,9 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
|
||||
|
||||
static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
|
||||
{
|
||||
cancel_work_sync(&ctrl->err_work);
|
||||
cancel_delayed_work_sync(&ctrl->reconnect_work);
|
||||
|
||||
nvme_rdma_teardown_io_queues(ctrl, shutdown);
|
||||
if (shutdown)
|
||||
nvme_shutdown_ctrl(&ctrl->ctrl);
|
||||
@ -1902,7 +1889,6 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
|
||||
.submit_async_event = nvme_rdma_submit_async_event,
|
||||
.delete_ctrl = nvme_rdma_delete_ctrl,
|
||||
.get_address = nvmf_get_address,
|
||||
.stop_ctrl = nvme_rdma_stop_ctrl,
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -1822,6 +1822,9 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
|
||||
|
||||
static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
|
||||
{
|
||||
cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
|
||||
cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
|
||||
|
||||
nvme_tcp_teardown_io_queues(ctrl, shutdown);
|
||||
if (shutdown)
|
||||
nvme_shutdown_ctrl(ctrl);
|
||||
@ -1859,12 +1862,6 @@ out_fail:
|
||||
nvme_tcp_reconnect_or_remove(ctrl);
|
||||
}
|
||||
|
||||
static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
|
||||
{
|
||||
cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
|
||||
cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
|
||||
}
|
||||
|
||||
static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
|
||||
{
|
||||
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
|
||||
@ -2115,7 +2112,6 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
|
||||
.submit_async_event = nvme_tcp_submit_async_event,
|
||||
.delete_ctrl = nvme_tcp_delete_ctrl,
|
||||
.get_address = nvmf_get_address,
|
||||
.stop_ctrl = nvme_tcp_stop_ctrl,
|
||||
};
|
||||
|
||||
static bool
|
||||
|
@ -1,15 +1,7 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* NVM Express device driver tracepoints
|
||||
* Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms and conditions of the GNU General Public License,
|
||||
* version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#include <asm/unaligned.h>
|
||||
|
@ -1,15 +1,7 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* NVM Express device driver tracepoints
|
||||
* Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms and conditions of the GNU General Public License,
|
||||
* version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#undef TRACE_SYSTEM
|
||||
|
@ -1,15 +1,7 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* NVMe admin command implementation.
|
||||
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms and conditions of the GNU General Public License,
|
||||
* version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
#include <linux/module.h>
|
||||
|
@ -1,15 +1,7 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Configfs interface for the NVMe target.
|
||||
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms and conditions of the GNU General Public License,
|
||||
* version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
#include <linux/kernel.h>
|
||||
|
@ -1,15 +1,7 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Common code for the NVMe target.
|
||||
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms and conditions of the GNU General Public License,
|
||||
* version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
#include <linux/module.h>
|
||||
|
@ -1,15 +1,7 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Discovery service for the NVMe over Fabrics target.
|
||||
* Copyright (C) 2016 Intel Corporation. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License version
|
||||
* 2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*/
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
#include <linux/slab.h>
|
||||
@ -331,7 +323,7 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
|
||||
cmd->get_log_page.lid);
|
||||
req->error_loc =
|
||||
offsetof(struct nvme_get_log_page_command, lid);
|
||||
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
|
||||
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
|
||||
}
|
||||
case nvme_admin_identify:
|
||||
req->data_len = NVME_IDENTIFY_DATA_SIZE;
|
||||
|
@ -1,15 +1,7 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* NVMe Fabrics command implementation.
|
||||
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms and conditions of the GNU General Public License,
|
||||
* version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
#include <linux/blkdev.h>
|
||||
|
@ -1,18 +1,6 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Copyright (c) 2016 Avago Technologies. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of version 2 of the GNU General Public License as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful.
|
||||
* ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES,
|
||||
* INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A
|
||||
* PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO
|
||||
* THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID.
|
||||
* See the GNU General Public License for more details, a copy of which
|
||||
* can be found in the file COPYING included with this package
|
||||
*
|
||||
*/
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
#include <linux/module.h>
|
||||
|
@ -1,17 +1,6 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Copyright (c) 2016 Avago Technologies. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of version 2 of the GNU General Public License as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful.
|
||||
* ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES,
|
||||
* INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A
|
||||
* PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO
|
||||
* THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID.
|
||||
* See the GNU General Public License for more details, a copy of which
|
||||
* can be found in the file COPYING included with this package
|
||||
*/
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
#include <linux/module.h>
|
||||
|
@ -1,15 +1,7 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* NVMe I/O command implementation.
|
||||
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms and conditions of the GNU General Public License,
|
||||
* version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
#include <linux/blkdev.h>
|
||||
|
@ -1,15 +1,7 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* NVMe over Fabrics loopback device.
|
||||
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms and conditions of the GNU General Public License,
|
||||
* version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
#include <linux/scatterlist.h>
|
||||
|
@ -1,14 +1,6 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms and conditions of the GNU General Public License,
|
||||
* version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#ifndef _NVMET_H
|
||||
|
@ -1,15 +1,7 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* NVMe over Fabrics RDMA target.
|
||||
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms and conditions of the GNU General Public License,
|
||||
* version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
#include <linux/atomic.h>
|
||||
|
@@ -1900,7 +1900,7 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost)
shost->tag_set.queue_depth = shost->can_queue;
shost->tag_set.cmd_size = cmd_size;
shost->tag_set.numa_node = NUMA_NO_NODE;
shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
shost->tag_set.flags |=
BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy);
shost->tag_set.driver_data = shost;
@ -20,8 +20,9 @@ static inline void read_endio(struct bio *bio)
|
||||
int i;
|
||||
struct bio_vec *bvec;
|
||||
const blk_status_t err = bio->bi_status;
|
||||
struct bvec_iter_all iter_all;
|
||||
|
||||
bio_for_each_segment_all(bvec, bio, i) {
|
||||
bio_for_each_segment_all(bvec, bio, i, iter_all) {
|
||||
struct page *page = bvec->bv_page;
|
||||
|
||||
/* page is already locked */
|
||||
|
@ -849,8 +849,9 @@ static inline void z_erofs_vle_read_endio(struct bio *bio)
|
||||
#ifdef EROFS_FS_HAS_MANAGED_CACHE
|
||||
struct address_space *mc = NULL;
|
||||
#endif
|
||||
struct bvec_iter_all iter_all;
|
||||
|
||||
bio_for_each_segment_all(bvec, bio, i) {
|
||||
bio_for_each_segment_all(bvec, bio, i, iter_all) {
|
||||
struct page *page = bvec->bv_page;
|
||||
bool cachemngd = false;
|
||||
|
||||
|
@ -211,6 +211,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
|
||||
ssize_t ret;
|
||||
blk_qc_t qc;
|
||||
int i;
|
||||
struct bvec_iter_all iter_all;
|
||||
|
||||
if ((pos | iov_iter_alignment(iter)) &
|
||||
(bdev_logical_block_size(bdev) - 1))
|
||||
@ -247,7 +248,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
|
||||
task_io_account_write(ret);
|
||||
}
|
||||
if (iocb->ki_flags & IOCB_HIPRI)
|
||||
bio.bi_opf |= REQ_HIPRI;
|
||||
bio_set_polled(&bio, iocb);
|
||||
|
||||
qc = submit_bio(&bio);
|
||||
for (;;) {
|
||||
@ -260,7 +261,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
|
||||
}
|
||||
__set_current_state(TASK_RUNNING);
|
||||
|
||||
bio_for_each_segment_all(bvec, &bio, i) {
|
||||
bio_for_each_segment_all(bvec, &bio, i, iter_all) {
|
||||
if (should_dirty && !PageCompound(bvec->bv_page))
|
||||
set_page_dirty_lock(bvec->bv_page);
|
||||
put_page(bvec->bv_page);
|
||||
@ -293,6 +294,14 @@ struct blkdev_dio {
|
||||
|
||||
static struct bio_set blkdev_dio_pool;
|
||||
|
||||
static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
|
||||
{
|
||||
struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
|
||||
struct request_queue *q = bdev_get_queue(bdev);
|
||||
|
||||
return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
|
||||
}
|
||||
|
||||
static void blkdev_bio_end_io(struct bio *bio)
|
||||
{
|
||||
struct blkdev_dio *dio = bio->bi_private;
|
||||
@ -329,8 +338,9 @@ static void blkdev_bio_end_io(struct bio *bio)
|
||||
} else {
|
||||
struct bio_vec *bvec;
|
||||
int i;
|
||||
struct bvec_iter_all iter_all;
|
||||
|
||||
bio_for_each_segment_all(bvec, bio, i)
|
||||
bio_for_each_segment_all(bvec, bio, i, iter_all)
|
||||
put_page(bvec->bv_page);
|
||||
bio_put(bio);
|
||||
}
|
||||
@ -406,10 +416,17 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
|
||||
|
||||
nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
|
||||
if (!nr_pages) {
|
||||
if (iocb->ki_flags & IOCB_HIPRI)
|
||||
bio->bi_opf |= REQ_HIPRI;
|
||||
bool polled = false;
|
||||
|
||||
if (iocb->ki_flags & IOCB_HIPRI) {
|
||||
bio_set_polled(bio, iocb);
|
||||
polled = true;
|
||||
}
|
||||
|
||||
qc = submit_bio(bio);
|
||||
|
||||
if (polled)
|
||||
WRITE_ONCE(iocb->ki_cookie, qc);
|
||||
break;
|
||||
}
|
||||
|
||||
@ -2076,6 +2093,7 @@ const struct file_operations def_blk_fops = {
|
||||
.llseek = block_llseek,
|
||||
.read_iter = blkdev_read_iter,
|
||||
.write_iter = blkdev_write_iter,
|
||||
.iopoll = blkdev_iopoll,
|
||||
.mmap = generic_file_mmap,
|
||||
.fsync = blkdev_fsync,
|
||||
.unlocked_ioctl = block_ioctl,
|
||||
|
@@ -162,13 +162,14 @@ csum_failed:
 	} else {
 		int i;
 		struct bio_vec *bvec;
+		struct bvec_iter_all iter_all;
 
 		/*
 		 * we have verified the checksum already, set page
 		 * checked so the end_io handlers know about it
 		 */
 		ASSERT(!bio_flagged(bio, BIO_CLONED));
-		bio_for_each_segment_all(bvec, cb->orig_bio, i)
+		bio_for_each_segment_all(bvec, cb->orig_bio, i, iter_all)
 			SetPageChecked(bvec->bv_page);
 
 		bio_endio(cb->orig_bio);
@@ -833,9 +833,10 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
 	struct bio_vec *bvec;
 	struct btrfs_root *root;
 	int i, ret = 0;
+	struct bvec_iter_all iter_all;
 
 	ASSERT(!bio_flagged(bio, BIO_CLONED));
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
 		ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
 		if (ret)
@@ -152,11 +152,12 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
 {
 	blk_status_t ret = 0;
 	struct bio_vec *bvec = bio_last_bvec_all(bio);
-	struct page *page = bvec->bv_page;
+	struct bio_vec bv;
 	struct extent_io_tree *tree = bio->bi_private;
 	u64 start;
 
-	start = page_offset(page) + bvec->bv_offset;
+	mp_bvec_last_segment(bvec, &bv);
+	start = page_offset(bv.bv_page) + bv.bv_offset;
 
 	bio->bi_private = NULL;
 
@@ -2379,7 +2380,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 	int read_mode = 0;
 	blk_status_t status;
 	int ret;
-	unsigned failed_bio_pages = bio_pages_all(failed_bio);
+	unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT;
 
 	BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
 
@@ -2451,9 +2452,10 @@ static void end_bio_extent_writepage(struct bio *bio)
 	u64 start;
 	u64 end;
 	int i;
+	struct bvec_iter_all iter_all;
 
 	ASSERT(!bio_flagged(bio, BIO_CLONED));
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		struct page *page = bvec->bv_page;
 		struct inode *inode = page->mapping->host;
 		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2522,9 +2524,10 @@ static void end_bio_extent_readpage(struct bio *bio)
 	int mirror;
 	int ret;
 	int i;
+	struct bvec_iter_all iter_all;
 
 	ASSERT(!bio_flagged(bio, BIO_CLONED));
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		struct page *page = bvec->bv_page;
 		struct inode *inode = page->mapping->host;
 		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -3641,9 +3644,10 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
 	struct bio_vec *bvec;
 	struct extent_buffer *eb;
 	int i, done;
+	struct bvec_iter_all iter_all;
 
 	ASSERT(!bio_flagged(bio, BIO_CLONED));
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		struct page *page = bvec->bv_page;
 
 		eb = (struct extent_buffer *)page->private;
@@ -7829,6 +7829,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
 	struct bio_vec *bvec;
 	struct extent_io_tree *io_tree, *failure_tree;
 	int i;
+	struct bvec_iter_all iter_all;
 
 	if (bio->bi_status)
 		goto end;
@@ -7840,7 +7841,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
 
 	done->uptodate = 1;
 	ASSERT(!bio_flagged(bio, BIO_CLONED));
-	bio_for_each_segment_all(bvec, bio, i)
+	bio_for_each_segment_all(bvec, bio, i, iter_all)
 		clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
 				 io_tree, done->start, bvec->bv_page,
 				 btrfs_ino(BTRFS_I(inode)), 0);
@@ -7919,6 +7920,7 @@ static void btrfs_retry_endio(struct bio *bio)
 	int uptodate;
 	int ret;
 	int i;
+	struct bvec_iter_all iter_all;
 
 	if (bio->bi_status)
 		goto end;
@@ -7932,7 +7934,7 @@ static void btrfs_retry_endio(struct bio *bio)
 	failure_tree = &BTRFS_I(inode)->io_failure_tree;
 
 	ASSERT(!bio_flagged(bio, BIO_CLONED));
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
 					     bvec->bv_offset, done->start,
 					     bvec->bv_len);
@@ -1443,10 +1443,11 @@ static void set_bio_pages_uptodate(struct bio *bio)
 {
 	struct bio_vec *bvec;
 	int i;
+	struct bvec_iter_all iter_all;
 
 	ASSERT(!bio_flagged(bio, BIO_CLONED));
 
-	bio_for_each_segment_all(bvec, bio, i)
+	bio_for_each_segment_all(bvec, bio, i, iter_all)
 		SetPageUptodate(bvec->bv_page);
 }
 
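Most of the per-filesystem churn above is mechanical: bio_for_each_segment_all() now takes a fourth argument, an on-stack struct bvec_iter_all, which the macro uses to visit every page of every (possibly multi-page) bvec. A minimal sketch of the resulting pattern in a completion handler (example_read_end_io is a made-up name, not part of this series):

static void example_read_end_io(struct bio *bio)
{
	struct bio_vec *bvec;
	int i;
	struct bvec_iter_all iter_all;

	/* visits each page of each bvec, even when a bvec spans several pages */
	bio_for_each_segment_all(bvec, bio, i, iter_all) {
		struct page *page = bvec->bv_page;

		if (!bio->bi_status)
			SetPageUptodate(page);
		else
			SetPageError(page);
		unlock_page(page);
	}
	bio_put(bio);
}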
 fs/buffer.c | 12
@@ -3027,13 +3027,23 @@ void guard_bio_eod(int op, struct bio *bio)
 	/* Uhhuh. We've got a bio that straddles the device size! */
 	truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
 
+	/*
+	 * The bio contains more than one segment which spans EOD, just return
+	 * and let IO layer turn it into an EIO
+	 */
+	if (truncated_bytes > bvec->bv_len)
+		return;
+
 	/* Truncate the bio.. */
 	bio->bi_iter.bi_size -= truncated_bytes;
 	bvec->bv_len -= truncated_bytes;
 
 	/* ..and clear the end of the buffer for reads */
 	if (op == REQ_OP_READ) {
-		zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
+		struct bio_vec bv;
+
+		mp_bvec_last_segment(bvec, &bv);
+		zero_user(bv.bv_page, bv.bv_offset + bv.bv_len,
 			  truncated_bytes);
 	}
 }
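With multi-page bvecs the last bvec of a bio can cover more than one page, so guard_bio_eod() above no longer pokes bvec->bv_page directly; it first extracts the final single-page segment. A sketch of that pattern in isolation (zero_bio_tail is a hypothetical name; it assumes truncated_bytes fits inside the last single-page segment):

static void zero_bio_tail(struct bio *bio, unsigned int truncated_bytes)
{
	struct bio_vec *mp_bvec = bio_last_bvec_all(bio);	/* may span several pages */
	struct bio_vec bv;

	mp_bvec_last_segment(mp_bvec, &bv);			/* last single-page segment */
	zero_user(bv.bv_page, bv.bv_offset + bv.bv_len - truncated_bytes,
		  truncated_bytes);
}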
@@ -30,8 +30,9 @@ static void __fscrypt_decrypt_bio(struct bio *bio, bool done)
 {
 	struct bio_vec *bv;
 	int i;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bv, bio, i) {
+	bio_for_each_segment_all(bv, bio, i, iter_all) {
 		struct page *page = bv->bv_page;
 		int ret = fscrypt_decrypt_page(page->mapping->host, page,
 				PAGE_SIZE, 0, page->index);
@@ -551,7 +551,9 @@ static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio)
 	if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) {
 		bio_check_pages_dirty(bio);	/* transfers ownership */
 	} else {
-		bio_for_each_segment_all(bvec, bio, i) {
+		struct bvec_iter_all iter_all;
+
+		bio_for_each_segment_all(bvec, bio, i, iter_all) {
 			struct page *page = bvec->bv_page;
 
 			if (dio->op == REQ_OP_READ && !PageCompound(page) &&
@@ -420,8 +420,9 @@ static void _clear_bio(struct bio *bio)
 {
 	struct bio_vec *bv;
 	unsigned i;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bv, bio, i) {
+	bio_for_each_segment_all(bv, bio, i, iter_all) {
 		unsigned this_count = bv->bv_len;
 
 		if (likely(PAGE_SIZE == this_count))
@@ -468,11 +468,12 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
 	/* loop on all devices all pages */
 	for (d = 0; d < ios->numdevs; d++) {
 		struct bio *bio = ios->per_dev[d].bio;
+		struct bvec_iter_all iter_all;
 
 		if (!bio)
 			continue;
 
-		bio_for_each_segment_all(bv, bio, i) {
+		bio_for_each_segment_all(bv, bio, i, iter_all) {
 			struct page *page = bv->bv_page;
 
 			SetPageUptodate(page);
@@ -63,8 +63,9 @@ static void ext4_finish_bio(struct bio *bio)
 {
 	int i;
 	struct bio_vec *bvec;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		struct page *page = bvec->bv_page;
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
 		struct page *data_page = NULL;
@@ -72,6 +72,7 @@ static void mpage_end_io(struct bio *bio)
 {
 	struct bio_vec *bv;
 	int i;
+	struct bvec_iter_all iter_all;
 
 	if (ext4_bio_encrypted(bio)) {
 		if (bio->bi_status) {
@@ -81,7 +82,7 @@ static void mpage_end_io(struct bio *bio)
 			return;
 		}
 	}
-	bio_for_each_segment_all(bv, bio, i) {
+	bio_for_each_segment_all(bv, bio, i, iter_all) {
 		struct page *page = bv->bv_page;
 
 		if (!bio->bi_status) {
@@ -87,8 +87,9 @@ static void __read_end_io(struct bio *bio)
 	struct page *page;
 	struct bio_vec *bv;
 	int i;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bv, bio, i) {
+	bio_for_each_segment_all(bv, bio, i, iter_all) {
 		page = bv->bv_page;
 
 		/* PG_error was set if any post_read step failed */
@@ -164,13 +165,14 @@ static void f2fs_write_end_io(struct bio *bio)
 	struct f2fs_sb_info *sbi = bio->bi_private;
 	struct bio_vec *bvec;
 	int i;
+	struct bvec_iter_all iter_all;
 
 	if (time_to_inject(sbi, FAULT_WRITE_IO)) {
 		f2fs_show_injection_info(FAULT_WRITE_IO);
 		bio->bi_status = BLK_STS_IOERR;
 	}
 
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		struct page *page = bvec->bv_page;
 		enum count_type type = WB_DATA_TYPE(page);
 
@@ -347,6 +349,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
 	struct bio_vec *bvec;
 	struct page *target;
 	int i;
+	struct bvec_iter_all iter_all;
 
 	if (!io->bio)
 		return false;
@@ -354,7 +357,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
 	if (!inode && !page && !ino)
 		return true;
 
-	bio_for_each_segment_all(bvec, io->bio, i) {
+	bio_for_each_segment_all(bvec, io->bio, i, iter_all) {
 
 		if (bvec->bv_page->mapping)
 			target = bvec->bv_page;
@@ -1280,6 +1280,7 @@ const struct file_operations gfs2_file_fops = {
 	.llseek		= gfs2_llseek,
 	.read_iter	= gfs2_file_read_iter,
 	.write_iter	= gfs2_file_write_iter,
+	.iopoll		= iomap_dio_iopoll,
 	.unlocked_ioctl	= gfs2_ioctl,
 	.mmap		= gfs2_mmap,
 	.open		= gfs2_open,
@@ -1310,6 +1311,7 @@ const struct file_operations gfs2_file_fops_nolock = {
 	.llseek		= gfs2_llseek,
 	.read_iter	= gfs2_file_read_iter,
 	.write_iter	= gfs2_file_write_iter,
+	.iopoll		= iomap_dio_iopoll,
 	.unlocked_ioctl	= gfs2_ioctl,
 	.mmap		= gfs2_mmap,
 	.open		= gfs2_open,
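Wiring .iopoll = iomap_dio_iopoll, as gfs2 does above, only pays off when the direct I/O path actually goes through iomap_dio_rw(), since that is what records the submission cookie and queue in the kiocb. A rough sketch for a hypothetical filesystem "myfs", assuming the four-argument iomap_dio_rw() of this kernel generation and an iomap_ops table defined elsewhere (locking and error handling omitted):

extern const struct iomap_ops myfs_iomap_ops;	/* assumed, defined elsewhere */

static ssize_t myfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	/* polled completion only applies to direct I/O submitted via iomap */
	if (iocb->ki_flags & IOCB_DIRECT)
		return iomap_dio_rw(iocb, to, &myfs_iomap_ops, NULL);
	return generic_file_read_iter(iocb, to);
}

const struct file_operations myfs_file_fops = {
	.read_iter	= myfs_file_read_iter,
	.iopoll		= iomap_dio_iopoll,
	/* ... */
};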
@@ -168,7 +168,8 @@ u64 gfs2_log_bmap(struct gfs2_sbd *sdp)
  * that is pinned in the pagecache.
  */
 
-static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec,
+static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp,
+				  struct bio_vec *bvec,
 				  blk_status_t error)
 {
 	struct buffer_head *bh, *next;
@@ -207,6 +208,7 @@ static void gfs2_end_log_write(struct bio *bio)
 	struct bio_vec *bvec;
 	struct page *page;
 	int i;
+	struct bvec_iter_all iter_all;
 
 	if (bio->bi_status) {
 		fs_err(sdp, "Error %d writing to journal, jid=%u\n",
@@ -214,7 +216,7 @@ static void gfs2_end_log_write(struct bio *bio)
 		wake_up(&sdp->sd_logd_waitq);
 	}
 
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		page = bvec->bv_page;
 		if (page_has_buffers(page))
 			gfs2_end_log_write_bh(sdp, bvec, bio->bi_status);
@@ -190,8 +190,9 @@ static void gfs2_meta_read_endio(struct bio *bio)
 {
 	struct bio_vec *bvec;
 	int i;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		struct page *page = bvec->bv_page;
 		struct buffer_head *bh = page_buffers(page);
 		unsigned int len = bvec->bv_len;
 fs/iomap.c | 53
@@ -274,8 +274,9 @@ iomap_read_end_io(struct bio *bio)
 	int error = blk_status_to_errno(bio->bi_status);
 	struct bio_vec *bvec;
 	int i;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bvec, bio, i)
+	bio_for_each_segment_all(bvec, bio, i, iter_all)
 		iomap_read_page_end_io(bvec, error);
 	bio_put(bio);
 }
@@ -324,7 +325,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 	 */
 	sector = iomap_sector(iomap, pos);
 	if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
-		if (__bio_try_merge_page(ctx->bio, page, plen, poff))
+		if (__bio_try_merge_page(ctx->bio, page, plen, poff, true))
 			goto done;
 		is_contig = true;
 	}
@@ -355,7 +356,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		ctx->bio->bi_end_io = iomap_read_end_io;
 	}
 
-	__bio_add_page(ctx->bio, page, plen, poff);
+	bio_add_page(ctx->bio, page, plen, poff);
 done:
 	/*
 	 * Move the caller beyond our range so that it keeps making progress.
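The readpage path above switches from __bio_add_page() to bio_add_page(): with multi-page bvecs, bio_add_page() can fold a physically contiguous page into the previous bvec, so a bio filled page by page may end up with fewer segments than pages. A sketch of that usage (build_read_bio and its caller are hypothetical, error handling trimmed):

static struct bio *build_read_bio(struct block_device *bdev, sector_t sector,
				  struct page **pages, unsigned int nr_pages)
{
	unsigned int max_vecs = min_t(unsigned int, nr_pages, BIO_MAX_PAGES);
	struct bio *bio = bio_alloc(GFP_NOFS, max_vecs);
	unsigned int i;

	bio_set_dev(bio, bdev);
	bio->bi_iter.bi_sector = sector;
	bio->bi_opf = REQ_OP_READ;

	for (i = 0; i < nr_pages; i++) {
		/* contiguous pages may be merged into a single multi-page bvec */
		if (bio_add_page(bio, pages[i], PAGE_SIZE, 0) != PAGE_SIZE)
			break;	/* bio is full: caller would submit and chain a new one */
	}

	return bio;
}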
@@ -1463,6 +1464,28 @@ struct iomap_dio {
 	};
 };
 
+int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
+{
+	struct request_queue *q = READ_ONCE(kiocb->private);
+
+	if (!q)
+		return 0;
+	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
+}
+EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
+
+static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
+		struct bio *bio)
+{
+	atomic_inc(&dio->ref);
+
+	if (dio->iocb->ki_flags & IOCB_HIPRI)
+		bio_set_polled(bio, dio->iocb);
+
+	dio->submit.last_queue = bdev_get_queue(iomap->bdev);
+	dio->submit.cookie = submit_bio(bio);
+}
+
 static ssize_t iomap_dio_complete(struct iomap_dio *dio)
 {
 	struct kiocb *iocb = dio->iocb;
@@ -1568,14 +1591,15 @@ static void iomap_dio_bio_end_io(struct bio *bio)
 	} else {
 		struct bio_vec *bvec;
 		int i;
+		struct bvec_iter_all iter_all;
 
-		bio_for_each_segment_all(bvec, bio, i)
+		bio_for_each_segment_all(bvec, bio, i, iter_all)
 			put_page(bvec->bv_page);
 		bio_put(bio);
 	}
 }
 
-static blk_qc_t
+static void
 iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
 		unsigned len)
 {
@@ -1589,15 +1613,10 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
 	bio->bi_private = dio;
 	bio->bi_end_io = iomap_dio_bio_end_io;
 
-	if (dio->iocb->ki_flags & IOCB_HIPRI)
-		flags |= REQ_HIPRI;
-
 	get_page(page);
 	__bio_add_page(bio, page, len, 0);
 	bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
-
-	atomic_inc(&dio->ref);
-	return submit_bio(bio);
+	iomap_dio_submit_bio(dio, iomap, bio);
 }
 
 static loff_t
@@ -1700,9 +1719,6 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 		bio_set_pages_dirty(bio);
 	}
 
-	if (dio->iocb->ki_flags & IOCB_HIPRI)
-		bio->bi_opf |= REQ_HIPRI;
-
 	iov_iter_advance(dio->submit.iter, n);
 
 	dio->size += n;
@@ -1710,11 +1726,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 		copied += n;
 
 		nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
-
-		atomic_inc(&dio->ref);
-
-		dio->submit.last_queue = bdev_get_queue(iomap->bdev);
-		dio->submit.cookie = submit_bio(bio);
+		iomap_dio_submit_bio(dio, iomap, bio);
 	} while (nr_pages);
 
 	/*
@@ -1925,6 +1937,9 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	if (dio->flags & IOMAP_DIO_WRITE_FUA)
 		dio->flags &= ~IOMAP_DIO_NEED_SYNC;
 
+	WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
+	WRITE_ONCE(iocb->private, dio->submit.last_queue);
+
 	/*
 	 * We are about to drop our additional submission reference, which
 	 * might be the last reference to the dio. There are three three
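After these hunks the kiocb carries everything ->iopoll needs: iomap_dio_rw() stores the submission cookie in ki_cookie and the queue in iocb->private, and iomap_dio_iopoll() feeds them to blk_poll(). A hedged sketch of how a submitter that set IOCB_HIPRI might drive the hook until its own completion callback fires (poll_kiocb_until_done and the done flag are hypothetical; io_uring's real loop is more involved):

static int poll_kiocb_until_done(struct kiocb *kiocb, bool *done)
{
	struct file *file = kiocb->ki_filp;
	int ret;

	if (!(kiocb->ki_flags & IOCB_HIPRI) || !file->f_op->iopoll)
		return -EOPNOTSUPP;

	while (!READ_ONCE(*done)) {
		/* spin on the queue the kiocb was submitted to */
		ret = file->f_op->iopoll(kiocb, true);
		if (ret < 0)
			return ret;
		cond_resched();
	}
	return 0;
}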
@@ -48,8 +48,9 @@ static void mpage_end_io(struct bio *bio)
 {
 	struct bio_vec *bv;
 	int i;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bv, bio, i) {
+	bio_for_each_segment_all(bv, bio, i, iter_all) {
 		struct page *page = bv->bv_page;
 		page_endio(page, bio_op(bio),
 				blk_status_to_errno(bio->bi_status));
Some files were not shown because too many files have changed in this diff.