for-5.1/block-20190302

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAlx63XIQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpp2vEACfrrQsap7R+Av28mmXpmXi2FPa3g5Tev1t
 yYjK2qHvhlMZjPTYw3hCmbYdDDczlF7PEgSE2x2DjdcsYapb8Fy1lZ2X16c7ztBR
 HD/t9b5AVSQsczZzKgv3RqsNtTnjzS5V0A8XH8FAP2QRgiwDMwSN6G0FP0JBLbE/
 ZgxQrH1Iy1F33Wz4hI3Z7dEghKPZrH1IlegkZCEu47q9SlWS76qUetSy2GEtchOl
 3Lgu54mQZyVdI5/QZf9DyMDLF6dIz3tYU2qhuo01AHjGRCC72v86p8sIiXcUr94Q
 8pbegJhJ/g8KBol9Qhv3+pWG/QUAZwi/ZwasTkK+MJ4klRXfOrznxPubW1z6t9Vn
 QRo39Po5SqqP0QWAscDxCFjESIQlWlKa+LZurJL7DJDCUGrSgzTpnVwFqKwc5zTP
 HJa5MT2tEeL2TfUYRYCfh0ZV0elINdHA1y1klDBh38drh4EWr2gW8xdseGYXqRjh
 fLgEpoF7VQ8kTvxKN+E4jZXkcZmoLmefp0ZyAbblS6IawpPVC7kXM9Fdn2OU8f2c
 fjVjvSiqxfeN6dnpfeLDRbbN9894HwgP/LPropJOQ7KmjCorQq5zMDkAvoh3tElq
 qwluRqdBJpWT/F05KweY+XVW8OawIycmUWqt6JrVNoIDAK31auHQv47kR0VA4OvE
 DRVVhYpocw==
 =VBaU
 -----END PGP SIGNATURE-----

Merge tag 'for-5.1/block-20190302' of git://git.kernel.dk/linux-block

Pull block layer updates from Jens Axboe:
 "Not a huge amount of changes in this round, the biggest one is that we
  finally have Mings multi-page bvec support merged. Apart from that,
  this pull request contains:

   - Small series that avoids quiescing the queue for sysfs changes that
     match what we currently have (Aleksei)

   - Series of bcache fixes (via Coly)

   - Series of lightnvm fixes (via Mathias)

   - NVMe pull request from Christoph. Nothing major, just SPDX/license
     cleanups, RR mp policy (Hannes), and little fixes (Bart,
     Chaitanya).

   - BFQ series (Paolo)

   - Save blk-mq cpu -> hw queue mapping, removing a pointer indirection
     for the fast path (Jianchao)

   - fops->iopoll() added for async IO polling, this is a feature that
     the upcoming io_uring interface will use (Christoph, me)

   - Partition scan loop fixes (Dongli)

   - mtip32xx conversion from managed resource API (Christoph)

   - cdrom registration race fix (Guenter)

   - MD pull from Song, two minor fixes.

   - Various documentation fixes (Marcos)

   - Multi-page bvec feature. This brings a lot of nice improvements
     with it, like more efficient splitting, larger IOs can be supported
     without growing the bvec table size, and so on. (Ming)

   - Various little fixes to core and drivers"

* tag 'for-5.1/block-20190302' of git://git.kernel.dk/linux-block: (117 commits)
  block: fix updating bio's front segment size
  block: Replace function name in string with __func__
  nbd: propagate genlmsg_reply return code
  floppy: remove set but not used variable 'q'
  null_blk: fix checking for REQ_FUA
  block: fix NULL pointer dereference in register_disk
  fs: fix guard_bio_eod to check for real EOD errors
  blk-mq: use HCTX_TYPE_DEFAULT but not 0 to index blk_mq_tag_set->map
  block: optimize bvec iteration in bvec_iter_advance
  block: introduce mp_bvec_for_each_page() for iterating over page
  block: optimize blk_bio_segment_split for single-page bvec
  block: optimize __blk_segment_map_sg() for single-page bvec
  block: introduce bvec_nth_page()
  iomap: wire up the iopoll method
  block: add bio_set_polled() helper
  block: wire up block device iopoll method
  fs: add an iopoll method to struct file_operations
  loop: set GENHD_FL_NO_PART_SCAN after blkdev_reread_part()
  loop: do not print warn message if partition scan is successful
  block: bounce: make sure that bvec table is updated
  ...
This commit is contained in:
Linus Torvalds 2019-03-08 14:12:17 -08:00
commit 80201fe175
114 changed files with 1471 additions and 1124 deletions

View File

@ -117,3 +117,28 @@ Other implications:
size limitations and the limitations of the underlying devices. Thus
there's no need to define ->merge_bvec_fn() callbacks for individual block
drivers.
Usage of helpers:
=================
* The following helpers whose names have the suffix of "_all" can only be used
on non-BIO_CLONED bio. They are usually used by filesystem code. Drivers
shouldn't use them because the bio may have been split before it reached the
driver.
bio_for_each_segment_all()
bio_first_bvec_all()
bio_first_page_all()
bio_last_bvec_all()
* The following helpers iterate over single-page segment. The passed 'struct
bio_vec' will contain a single-page IO vector during the iteration
bio_for_each_segment()
bio_for_each_segment_all()
* The following helpers iterate over multi-page bvec. The passed 'struct
bio_vec' will contain a multi-page IO vector during the iteration
bio_for_each_bvec()
rq_for_each_bvec()

View File

@ -857,6 +857,7 @@ struct file_operations {
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
int (*iopoll)(struct kiocb *kiocb, bool spin);
int (*iterate) (struct file *, struct dir_context *);
int (*iterate_shared) (struct file *, struct dir_context *);
__poll_t (*poll) (struct file *, struct poll_table_struct *);
@ -902,6 +903,8 @@ otherwise noted.
write_iter: possibly asynchronous write with iov_iter as source
iopoll: called when aio wants to poll for completions on HIPRI iocbs
iterate: called when the VFS needs to read the directory contents
iterate_shared: called when the VFS needs to read the directory contents

View File

@ -230,11 +230,16 @@ static struct kmem_cache *bfq_pool;
#define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
/* hw_tag detection: parallel requests threshold and min samples needed. */
#define BFQ_HW_QUEUE_THRESHOLD 4
#define BFQ_HW_QUEUE_THRESHOLD 3
#define BFQ_HW_QUEUE_SAMPLES 32
#define BFQQ_SEEK_THR (sector_t)(8 * 100)
#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
#define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \
(get_sdist(last_pos, rq) > \
BFQQ_SEEK_THR && \
(!blk_queue_nonrot(bfqd->queue) || \
blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT))
#define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19)
@ -623,26 +628,6 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfqq->pos_root = NULL;
}
/*
* Tell whether there are active queues with different weights or
* active groups.
*/
static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
{
/*
* For queue weights to differ, queue_weights_tree must contain
* at least two nodes.
*/
return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
(bfqd->queue_weights_tree.rb_node->rb_left ||
bfqd->queue_weights_tree.rb_node->rb_right)
#ifdef CONFIG_BFQ_GROUP_IOSCHED
) ||
(bfqd->num_groups_with_pending_reqs > 0
#endif
);
}
/*
* The following function returns true if every queue must receive the
* same share of the throughput (this condition is used when deciding
@ -651,25 +636,48 @@ static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
*
* Such a scenario occurs when:
* 1) all active queues have the same weight,
* 2) all active groups at the same level in the groups tree have the same
* weight,
* 2) all active queues belong to the same I/O-priority class,
* 3) all active groups at the same level in the groups tree have the same
* weight,
* 4) all active groups at the same level in the groups tree have the same
* number of children.
*
* Unfortunately, keeping the necessary state for evaluating exactly
* the last two symmetry sub-conditions above would be quite complex
* and time consuming. Therefore this function evaluates, instead,
* only the following stronger two sub-conditions, for which it is
* and time consuming. Therefore this function evaluates, instead,
* only the following stronger three sub-conditions, for which it is
* much easier to maintain the needed state:
* 1) all active queues have the same weight,
* 2) there are no active groups.
* 2) all active queues belong to the same I/O-priority class,
* 3) there are no active groups.
* In particular, the last condition is always true if hierarchical
* support or the cgroups interface are not enabled, thus no state
* needs to be maintained in this case.
*/
static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
{
return !bfq_varied_queue_weights_or_active_groups(bfqd);
/*
* For queue weights to differ, queue_weights_tree must contain
* at least two nodes.
*/
bool varied_queue_weights = !RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
(bfqd->queue_weights_tree.rb_node->rb_left ||
bfqd->queue_weights_tree.rb_node->rb_right);
bool multiple_classes_busy =
(bfqd->busy_queues[0] && bfqd->busy_queues[1]) ||
(bfqd->busy_queues[0] && bfqd->busy_queues[2]) ||
(bfqd->busy_queues[1] && bfqd->busy_queues[2]);
/*
* For queue weights to differ, queue_weights_tree must contain
* at least two nodes.
*/
return !(varied_queue_weights || multiple_classes_busy
#ifdef BFQ_GROUP_IOSCHED_ENABLED
|| bfqd->num_groups_with_pending_reqs > 0
#endif
);
}
/*
@ -728,15 +736,14 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
/*
* In the unlucky event of an allocation failure, we just
* exit. This will cause the weight of queue to not be
* considered in bfq_varied_queue_weights_or_active_groups,
* which, in its turn, causes the scenario to be deemed
* wrongly symmetric in case bfqq's weight would have been
* the only weight making the scenario asymmetric. On the
* bright side, no unbalance will however occur when bfqq
* becomes inactive again (the invocation of this function
* is triggered by an activation of queue). In fact,
* bfq_weights_tree_remove does nothing if
* !bfqq->weight_counter.
* considered in bfq_symmetric_scenario, which, in its turn,
* causes the scenario to be deemed wrongly symmetric in case
* bfqq's weight would have been the only weight making the
* scenario asymmetric. On the bright side, no unbalance will
* however occur when bfqq becomes inactive again (the
* invocation of this function is triggered by an activation
* of queue). In fact, bfq_weights_tree_remove does nothing
* if !bfqq->weight_counter.
*/
if (unlikely(!bfqq->weight_counter))
return;
@ -747,6 +754,7 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
inc_counter:
bfqq->weight_counter->num_active++;
bfqq->ref++;
}
/*
@ -771,6 +779,7 @@ void __bfq_weights_tree_remove(struct bfq_data *bfqd,
reset_entity_pointer:
bfqq->weight_counter = NULL;
bfq_put_queue(bfqq);
}
/*
@ -782,9 +791,6 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd,
{
struct bfq_entity *entity = bfqq->entity.parent;
__bfq_weights_tree_remove(bfqd, bfqq,
&bfqd->queue_weights_tree);
for_each_entity(entity) {
struct bfq_sched_data *sd = entity->my_sched_data;
@ -818,6 +824,15 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd,
bfqd->num_groups_with_pending_reqs--;
}
}
/*
* Next function is invoked last, because it causes bfqq to be
* freed if the following holds: bfqq is not in service and
* has no dispatched request. DO NOT use bfqq after the next
* function invocation.
*/
__bfq_weights_tree_remove(bfqd, bfqq,
&bfqd->queue_weights_tree);
}
/*
@ -873,7 +888,8 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
static unsigned long bfq_serv_to_charge(struct request *rq,
struct bfq_queue *bfqq)
{
if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1 ||
!bfq_symmetric_scenario(bfqq->bfqd))
return blk_rq_sectors(rq);
return blk_rq_sectors(rq) * bfq_async_charge_factor;
@ -907,8 +923,10 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
*/
return;
new_budget = max_t(unsigned long, bfqq->max_budget,
bfq_serv_to_charge(next_rq, bfqq));
new_budget = max_t(unsigned long,
max_t(unsigned long, bfqq->max_budget,
bfq_serv_to_charge(next_rq, bfqq)),
entity->service);
if (entity->budget != new_budget) {
entity->budget = new_budget;
bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
@ -1011,7 +1029,8 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
static int bfqq_process_refs(struct bfq_queue *bfqq)
{
return bfqq->ref - bfqq->allocated - bfqq->entity.on_st;
return bfqq->ref - bfqq->allocated - bfqq->entity.on_st -
(bfqq->weight_counter != NULL);
}
/* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */
@ -1380,7 +1399,15 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
{
struct bfq_entity *entity = &bfqq->entity;
if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
/*
* In the next compound condition, we check also whether there
* is some budget left, because otherwise there is no point in
* trying to go on serving bfqq with this same budget: bfqq
* would be expired immediately after being selected for
* service. This would only cause useless overhead.
*/
if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time &&
bfq_bfqq_budget_left(bfqq) > 0) {
/*
* We do not clear the flag non_blocking_wait_rq here, as
* the latter is used in bfq_activate_bfqq to signal
@ -2217,14 +2244,15 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
return NULL;
/* If there is only one backlogged queue, don't search. */
if (bfqd->busy_queues == 1)
if (bfq_tot_busy_queues(bfqd) == 1)
return NULL;
in_service_bfqq = bfqd->in_service_queue;
if (in_service_bfqq && in_service_bfqq != bfqq &&
likely(in_service_bfqq != &bfqd->oom_bfqq) &&
bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
bfq_rq_close_to_sector(io_struct, request,
bfqd->in_serv_last_pos) &&
bfqq->entity.parent == in_service_bfqq->entity.parent &&
bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
@ -2742,7 +2770,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
if ((bfqd->rq_in_driver > 0 ||
now_ns - bfqd->last_completion < BFQ_MIN_TT)
&& get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
&& !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq))
bfqd->sequential_samples++;
bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
@ -2764,6 +2792,8 @@ update_rate_and_reset:
bfq_update_rate_reset(bfqd, rq);
update_last_values:
bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
if (RQ_BFQQ(rq) == bfqd->in_service_queue)
bfqd->in_serv_last_pos = bfqd->last_position;
bfqd->last_dispatch = now_ns;
}
@ -3274,16 +3304,32 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
* requests, then the request pattern is isochronous
* (see the comments on the function
* bfq_bfqq_softrt_next_start()). Thus we can compute
* soft_rt_next_start. If, instead, the queue still
* has outstanding requests, then we have to wait for
* the completion of all the outstanding requests to
* discover whether the request pattern is actually
* isochronous.
* soft_rt_next_start. And we do it, unless bfqq is in
* interactive weight raising. We do not do it in the
* latter subcase, for the following reason. bfqq may
* be conveying the I/O needed to load a soft
* real-time application. Such an application will
* actually exhibit a soft real-time I/O pattern after
* it finally starts doing its job. But, if
* soft_rt_next_start is computed here for an
* interactive bfqq, and bfqq had received a lot of
* service before remaining with no outstanding
* request (likely to happen on a fast device), then
* soft_rt_next_start would be assigned such a high
* value that, for a very long time, bfqq would be
* prevented from being possibly considered as soft
* real time.
*
* If, instead, the queue still has outstanding
* requests, then we have to wait for the completion
* of all the outstanding requests to discover whether
* the request pattern is actually isochronous.
*/
if (bfqq->dispatched == 0)
if (bfqq->dispatched == 0 &&
bfqq->wr_coeff != bfqd->bfq_wr_coeff)
bfqq->soft_rt_next_start =
bfq_bfqq_softrt_next_start(bfqd, bfqq);
else {
else if (bfqq->dispatched > 0) {
/*
* Schedule an update of soft_rt_next_start to when
* the task may be discovered to be isochronous.
@ -3376,53 +3422,13 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
bfq_bfqq_budget_timeout(bfqq);
}
/*
* For a queue that becomes empty, device idling is allowed only if
* this function returns true for the queue. As a consequence, since
* device idling plays a critical role in both throughput boosting and
* service guarantees, the return value of this function plays a
* critical role in both these aspects as well.
*
* In a nutshell, this function returns true only if idling is
* beneficial for throughput or, even if detrimental for throughput,
* idling is however necessary to preserve service guarantees (low
* latency, desired throughput distribution, ...). In particular, on
* NCQ-capable devices, this function tries to return false, so as to
* help keep the drives' internal queues full, whenever this helps the
* device boost the throughput without causing any service-guarantee
* issue.
*
* In more detail, the return value of this function is obtained by,
* first, computing a number of boolean variables that take into
* account throughput and service-guarantee issues, and, then,
* combining these variables in a logical expression. Most of the
* issues taken into account are not trivial. We discuss these issues
* individually while introducing the variables.
*/
static bool bfq_better_to_idle(struct bfq_queue *bfqq)
static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
struct bfq_queue *bfqq)
{
struct bfq_data *bfqd = bfqq->bfqd;
bool rot_without_queueing =
!blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag,
bfqq_sequential_and_IO_bound,
idling_boosts_thr, idling_boosts_thr_without_issues,
idling_needed_for_service_guarantees,
asymmetric_scenario;
if (bfqd->strict_guarantees)
return true;
/*
* Idling is performed only if slice_idle > 0. In addition, we
* do not idle if
* (a) bfqq is async
* (b) bfqq is in the idle io prio class: in this case we do
* not idle because we want to minimize the bandwidth that
* queues in this class can steal to higher-priority queues
*/
if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
bfq_class_idle(bfqq))
return false;
idling_boosts_thr;
bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) &&
bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq);
@ -3454,8 +3460,7 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
bfqq_sequential_and_IO_bound);
/*
* The value of the next variable,
* idling_boosts_thr_without_issues, is equal to that of
* The return value of this function is equal to that of
* idling_boosts_thr, unless a special case holds. In this
* special case, described below, idling may cause problems to
* weight-raised queues.
@ -3472,217 +3477,252 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
* which enqueue several requests in advance, and further
* reorder internally-queued requests.
*
* For this reason, we force to false the value of
* idling_boosts_thr_without_issues if there are weight-raised
* busy queues. In this case, and if bfqq is not weight-raised,
* this guarantees that the device is not idled for bfqq (if,
* instead, bfqq is weight-raised, then idling will be
* guaranteed by another variable, see below). Combined with
* the timestamping rules of BFQ (see [1] for details), this
* behavior causes bfqq, and hence any sync non-weight-raised
* queue, to get a lower number of requests served, and thus
* to ask for a lower number of requests from the request
* pool, before the busy weight-raised queues get served
* again. This often mitigates starvation problems in the
* presence of heavy write workloads and NCQ, thereby
* guaranteeing a higher application and system responsiveness
* in these hostile scenarios.
* For this reason, we force to false the return value if
* there are weight-raised busy queues. In this case, and if
* bfqq is not weight-raised, this guarantees that the device
* is not idled for bfqq (if, instead, bfqq is weight-raised,
* then idling will be guaranteed by another variable, see
* below). Combined with the timestamping rules of BFQ (see
* [1] for details), this behavior causes bfqq, and hence any
* sync non-weight-raised queue, to get a lower number of
* requests served, and thus to ask for a lower number of
* requests from the request pool, before the busy
* weight-raised queues get served again. This often mitigates
* starvation problems in the presence of heavy write
* workloads and NCQ, thereby guaranteeing a higher
* application and system responsiveness in these hostile
* scenarios.
*/
idling_boosts_thr_without_issues = idling_boosts_thr &&
return idling_boosts_thr &&
bfqd->wr_busy_queues == 0;
}
/*
* There is then a case where idling must be performed not
* for throughput concerns, but to preserve service
* guarantees.
*
* To introduce this case, we can note that allowing the drive
* to enqueue more than one request at a time, and hence
* delegating de facto final scheduling decisions to the
* drive's internal scheduler, entails loss of control on the
* actual request service order. In particular, the critical
* situation is when requests from different processes happen
* to be present, at the same time, in the internal queue(s)
* of the drive. In such a situation, the drive, by deciding
* the service order of the internally-queued requests, does
* determine also the actual throughput distribution among
* these processes. But the drive typically has no notion or
* concern about per-process throughput distribution, and
* makes its decisions only on a per-request basis. Therefore,
* the service distribution enforced by the drive's internal
* scheduler is likely to coincide with the desired
* device-throughput distribution only in a completely
* symmetric scenario where:
* (i) each of these processes must get the same throughput as
* the others;
* (ii) the I/O of each process has the same properties, in
* terms of locality (sequential or random), direction
* (reads or writes), request sizes, greediness
* (from I/O-bound to sporadic), and so on.
* In fact, in such a scenario, the drive tends to treat
* the requests of each of these processes in about the same
* way as the requests of the others, and thus to provide
* each of these processes with about the same throughput
* (which is exactly the desired throughput distribution). In
* contrast, in any asymmetric scenario, device idling is
* certainly needed to guarantee that bfqq receives its
* assigned fraction of the device throughput (see [1] for
* details).
* The problem is that idling may significantly reduce
* throughput with certain combinations of types of I/O and
* devices. An important example is sync random I/O, on flash
* storage with command queueing. So, unless bfqq falls in the
* above cases where idling also boosts throughput, it would
* be important to check conditions (i) and (ii) accurately,
* so as to avoid idling when not strictly needed for service
* guarantees.
*
* Unfortunately, it is extremely difficult to thoroughly
* check condition (ii). And, in case there are active groups,
* it becomes very difficult to check condition (i) too. In
* fact, if there are active groups, then, for condition (i)
* to become false, it is enough that an active group contains
* more active processes or sub-groups than some other active
* group. More precisely, for condition (i) to hold because of
* such a group, it is not even necessary that the group is
* (still) active: it is sufficient that, even if the group
* has become inactive, some of its descendant processes still
* have some request already dispatched but still waiting for
* completion. In fact, requests have still to be guaranteed
* their share of the throughput even after being
* dispatched. In this respect, it is easy to show that, if a
* group frequently becomes inactive while still having
* in-flight requests, and if, when this happens, the group is
* not considered in the calculation of whether the scenario
* is asymmetric, then the group may fail to be guaranteed its
* fair share of the throughput (basically because idling may
* not be performed for the descendant processes of the group,
* but it had to be). We address this issue with the
* following bi-modal behavior, implemented in the function
* bfq_symmetric_scenario().
*
* If there are groups with requests waiting for completion
* (as commented above, some of these groups may even be
* already inactive), then the scenario is tagged as
* asymmetric, conservatively, without checking any of the
* conditions (i) and (ii). So the device is idled for bfqq.
* This behavior matches also the fact that groups are created
* exactly if controlling I/O is a primary concern (to
* preserve bandwidth and latency guarantees).
*
* On the opposite end, if there are no groups with requests
* waiting for completion, then only condition (i) is actually
* controlled, i.e., provided that condition (i) holds, idling
* is not performed, regardless of whether condition (ii)
* holds. In other words, only if condition (i) does not hold,
* then idling is allowed, and the device tends to be
* prevented from queueing many requests, possibly of several
* processes. Since there are no groups with requests waiting
* for completion, then, to control condition (i) it is enough
* to check just whether all the queues with requests waiting
* for completion also have the same weight.
*
* Not checking condition (ii) evidently exposes bfqq to the
* risk of getting less throughput than its fair share.
* However, for queues with the same weight, a further
* mechanism, preemption, mitigates or even eliminates this
* problem. And it does so without consequences on overall
* throughput. This mechanism and its benefits are explained
* in the next three paragraphs.
*
* Even if a queue, say Q, is expired when it remains idle, Q
* can still preempt the new in-service queue if the next
* request of Q arrives soon (see the comments on
* bfq_bfqq_update_budg_for_activation). If all queues and
* groups have the same weight, this form of preemption,
* combined with the hole-recovery heuristic described in the
* comments on function bfq_bfqq_update_budg_for_activation,
* are enough to preserve a correct bandwidth distribution in
* the mid term, even without idling. In fact, even if not
* idling allows the internal queues of the device to contain
* many requests, and thus to reorder requests, we can rather
* safely assume that the internal scheduler still preserves a
* minimum of mid-term fairness.
*
* More precisely, this preemption-based, idleless approach
* provides fairness in terms of IOPS, and not sectors per
* second. This can be seen with a simple example. Suppose
* that there are two queues with the same weight, but that
* the first queue receives requests of 8 sectors, while the
* second queue receives requests of 1024 sectors. In
* addition, suppose that each of the two queues contains at
* most one request at a time, which implies that each queue
* always remains idle after it is served. Finally, after
* remaining idle, each queue receives very quickly a new
* request. It follows that the two queues are served
* alternatively, preempting each other if needed. This
* implies that, although both queues have the same weight,
* the queue with large requests receives a service that is
* 1024/8 times as high as the service received by the other
* queue.
*
* The motivation for using preemption instead of idling (for
* queues with the same weight) is that, by not idling,
* service guarantees are preserved (completely or at least in
* part) without minimally sacrificing throughput. And, if
* there is no active group, then the primary expectation for
* this device is probably a high throughput.
*
* We are now left only with explaining the additional
* compound condition that is checked below for deciding
* whether the scenario is asymmetric. To explain this
* compound condition, we need to add that the function
* bfq_symmetric_scenario checks the weights of only
* non-weight-raised queues, for efficiency reasons (see
* comments on bfq_weights_tree_add()). Then the fact that
* bfqq is weight-raised is checked explicitly here. More
* precisely, the compound condition below takes into account
* also the fact that, even if bfqq is being weight-raised,
* the scenario is still symmetric if all queues with requests
* waiting for completion happen to be
* weight-raised. Actually, we should be even more precise
* here, and differentiate between interactive weight raising
* and soft real-time weight raising.
*
* As a side note, it is worth considering that the above
* device-idling countermeasures may however fail in the
* following unlucky scenario: if idling is (correctly)
* disabled in a time period during which all symmetry
* sub-conditions hold, and hence the device is allowed to
* enqueue many requests, but at some later point in time some
* sub-condition stops to hold, then it may become impossible
* to let requests be served in the desired order until all
* the requests already queued in the device have been served.
*/
asymmetric_scenario = (bfqq->wr_coeff > 1 &&
bfqd->wr_busy_queues < bfqd->busy_queues) ||
/*
* There is a case where idling must be performed not for
* throughput concerns, but to preserve service guarantees.
*
* To introduce this case, we can note that allowing the drive
* to enqueue more than one request at a time, and hence
* delegating de facto final scheduling decisions to the
* drive's internal scheduler, entails loss of control on the
* actual request service order. In particular, the critical
* situation is when requests from different processes happen
* to be present, at the same time, in the internal queue(s)
* of the drive. In such a situation, the drive, by deciding
* the service order of the internally-queued requests, does
* determine also the actual throughput distribution among
* these processes. But the drive typically has no notion or
* concern about per-process throughput distribution, and
* makes its decisions only on a per-request basis. Therefore,
* the service distribution enforced by the drive's internal
* scheduler is likely to coincide with the desired
* device-throughput distribution only in a completely
* symmetric scenario where:
* (i) each of these processes must get the same throughput as
* the others;
* (ii) the I/O of each process has the same properties, in
* terms of locality (sequential or random), direction
* (reads or writes), request sizes, greediness
* (from I/O-bound to sporadic), and so on.
* In fact, in such a scenario, the drive tends to treat
* the requests of each of these processes in about the same
* way as the requests of the others, and thus to provide
* each of these processes with about the same throughput
* (which is exactly the desired throughput distribution). In
* contrast, in any asymmetric scenario, device idling is
* certainly needed to guarantee that bfqq receives its
* assigned fraction of the device throughput (see [1] for
* details).
* The problem is that idling may significantly reduce
* throughput with certain combinations of types of I/O and
* devices. An important example is sync random I/O, on flash
* storage with command queueing. So, unless bfqq falls in the
* above cases where idling also boosts throughput, it would
* be important to check conditions (i) and (ii) accurately,
* so as to avoid idling when not strictly needed for service
* guarantees.
*
* Unfortunately, it is extremely difficult to thoroughly
* check condition (ii). And, in case there are active groups,
* it becomes very difficult to check condition (i) too. In
* fact, if there are active groups, then, for condition (i)
* to become false, it is enough that an active group contains
* more active processes or sub-groups than some other active
* group. More precisely, for condition (i) to hold because of
* such a group, it is not even necessary that the group is
* (still) active: it is sufficient that, even if the group
* has become inactive, some of its descendant processes still
* have some request already dispatched but still waiting for
* completion. In fact, requests have still to be guaranteed
* their share of the throughput even after being
* dispatched. In this respect, it is easy to show that, if a
* group frequently becomes inactive while still having
* in-flight requests, and if, when this happens, the group is
* not considered in the calculation of whether the scenario
* is asymmetric, then the group may fail to be guaranteed its
* fair share of the throughput (basically because idling may
* not be performed for the descendant processes of the group,
* but it had to be). We address this issue with the
* following bi-modal behavior, implemented in the function
* bfq_symmetric_scenario().
*
* If there are groups with requests waiting for completion
* (as commented above, some of these groups may even be
* already inactive), then the scenario is tagged as
* asymmetric, conservatively, without checking any of the
* conditions (i) and (ii). So the device is idled for bfqq.
* This behavior matches also the fact that groups are created
* exactly if controlling I/O is a primary concern (to
* preserve bandwidth and latency guarantees).
*
* On the opposite end, if there are no groups with requests
* waiting for completion, then only condition (i) is actually
* controlled, i.e., provided that condition (i) holds, idling
* is not performed, regardless of whether condition (ii)
* holds. In other words, only if condition (i) does not hold,
* then idling is allowed, and the device tends to be
* prevented from queueing many requests, possibly of several
* processes. Since there are no groups with requests waiting
* for completion, then, to control condition (i) it is enough
* to check just whether all the queues with requests waiting
* for completion also have the same weight.
*
* Not checking condition (ii) evidently exposes bfqq to the
* risk of getting less throughput than its fair share.
* However, for queues with the same weight, a further
* mechanism, preemption, mitigates or even eliminates this
* problem. And it does so without consequences on overall
* throughput. This mechanism and its benefits are explained
* in the next three paragraphs.
*
* Even if a queue, say Q, is expired when it remains idle, Q
* can still preempt the new in-service queue if the next
* request of Q arrives soon (see the comments on
* bfq_bfqq_update_budg_for_activation). If all queues and
* groups have the same weight, this form of preemption,
* combined with the hole-recovery heuristic described in the
* comments on function bfq_bfqq_update_budg_for_activation,
* are enough to preserve a correct bandwidth distribution in
* the mid term, even without idling. In fact, even if not
* idling allows the internal queues of the device to contain
* many requests, and thus to reorder requests, we can rather
* safely assume that the internal scheduler still preserves a
* minimum of mid-term fairness.
*
* More precisely, this preemption-based, idleless approach
* provides fairness in terms of IOPS, and not sectors per
* second. This can be seen with a simple example. Suppose
* that there are two queues with the same weight, but that
* the first queue receives requests of 8 sectors, while the
* second queue receives requests of 1024 sectors. In
* addition, suppose that each of the two queues contains at
* most one request at a time, which implies that each queue
* always remains idle after it is served. Finally, after
* remaining idle, each queue receives very quickly a new
* request. It follows that the two queues are served
* alternatively, preempting each other if needed. This
* implies that, although both queues have the same weight,
* the queue with large requests receives a service that is
* 1024/8 times as high as the service received by the other
* queue.
*
* The motivation for using preemption instead of idling (for
* queues with the same weight) is that, by not idling,
* service guarantees are preserved (completely or at least in
* part) without minimally sacrificing throughput. And, if
* there is no active group, then the primary expectation for
* this device is probably a high throughput.
*
* We are now left only with explaining the additional
* compound condition that is checked below for deciding
* whether the scenario is asymmetric. To explain this
* compound condition, we need to add that the function
* bfq_symmetric_scenario checks the weights of only
* non-weight-raised queues, for efficiency reasons (see
* comments on bfq_weights_tree_add()). Then the fact that
* bfqq is weight-raised is checked explicitly here. More
* precisely, the compound condition below takes into account
* also the fact that, even if bfqq is being weight-raised,
* the scenario is still symmetric if all queues with requests
* waiting for completion happen to be
* weight-raised. Actually, we should be even more precise
* here, and differentiate between interactive weight raising
* and soft real-time weight raising.
*
* As a side note, it is worth considering that the above
* device-idling countermeasures may however fail in the
* following unlucky scenario: if idling is (correctly)
* disabled in a time period during which all symmetry
* sub-conditions hold, and hence the device is allowed to
* enqueue many requests, but at some later point in time some
* sub-condition stops to hold, then it may become impossible
* to let requests be served in the desired order until all
* the requests already queued in the device have been served.
*/
static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
struct bfq_queue *bfqq)
{
return (bfqq->wr_coeff > 1 &&
bfqd->wr_busy_queues <
bfq_tot_busy_queues(bfqd)) ||
!bfq_symmetric_scenario(bfqd);
}
/*
* For a queue that becomes empty, device idling is allowed only if
* this function returns true for that queue. As a consequence, since
* device idling plays a critical role for both throughput boosting
* and service guarantees, the return value of this function plays a
* critical role as well.
*
* In a nutshell, this function returns true only if idling is
* beneficial for throughput or, even if detrimental for throughput,
* idling is however necessary to preserve service guarantees (low
* latency, desired throughput distribution, ...). In particular, on
* NCQ-capable devices, this function tries to return false, so as to
* help keep the drives' internal queues full, whenever this helps the
* device boost the throughput without causing any service-guarantee
* issue.
*
* Most of the issues taken into account to get the return value of
* this function are not trivial. We discuss these issues in the two
* functions providing the main pieces of information needed by this
* function.
*/
static bool bfq_better_to_idle(struct bfq_queue *bfqq)
{
struct bfq_data *bfqd = bfqq->bfqd;
bool idling_boosts_thr_with_no_issue, idling_needed_for_service_guar;
if (unlikely(bfqd->strict_guarantees))
return true;
/*
* Finally, there is a case where maximizing throughput is the
* best choice even if it may cause unfairness toward
* bfqq. Such a case is when bfqq became active in a burst of
* queue activations. Queues that became active during a large
* burst benefit only from throughput, as discussed in the
* comments on bfq_handle_burst. Thus, if bfqq became active
* in a burst and not idling the device maximizes throughput,
* then the device must no be idled, because not idling the
* device provides bfqq and all other queues in the burst with
* maximum benefit. Combining this and the above case, we can
* now establish when idling is actually needed to preserve
* service guarantees.
* Idling is performed only if slice_idle > 0. In addition, we
* do not idle if
* (a) bfqq is async
* (b) bfqq is in the idle io prio class: in this case we do
* not idle because we want to minimize the bandwidth that
* queues in this class can steal to higher-priority queues
*/
idling_needed_for_service_guarantees =
asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
bfq_class_idle(bfqq))
return false;
idling_boosts_thr_with_no_issue =
idling_boosts_thr_without_issues(bfqd, bfqq);
idling_needed_for_service_guar =
idling_needed_for_service_guarantees(bfqd, bfqq);
/*
* We have now all the components we need to compute the
* We have now the two components we need to compute the
* return value of the function, which is true only if idling
* either boosts the throughput (without issues), or is
* necessary to preserve service guarantees.
*/
return idling_boosts_thr_without_issues ||
idling_needed_for_service_guarantees;
return idling_boosts_thr_with_no_issue ||
idling_needed_for_service_guar;
}
/*
@ -3934,7 +3974,7 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
* belongs to CLASS_IDLE and other queues are waiting for
* service.
*/
if (!(bfqd->busy_queues > 1 && bfq_class_idle(bfqq)))
if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq)))
goto return_rq;
bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
@ -3952,7 +3992,7 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
* most a call to dispatch for nothing
*/
return !list_empty_careful(&bfqd->dispatch) ||
bfqd->busy_queues > 0;
bfq_tot_busy_queues(bfqd) > 0;
}
static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
@ -4006,9 +4046,10 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
goto start_rq;
}
bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
bfq_log(bfqd, "dispatch requests: %d busy queues",
bfq_tot_busy_queues(bfqd));
if (bfqd->busy_queues == 0)
if (bfq_tot_busy_queues(bfqd) == 0)
goto exit;
/*
@ -4488,10 +4529,7 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct request *rq)
{
bfqq->seek_history <<= 1;
bfqq->seek_history |=
get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
(!blk_queue_nonrot(bfqd->queue) ||
blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
bfqq->seek_history |= BFQ_RQ_SEEKY(bfqd, bfqq->last_request_pos, rq);
}
static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
@ -4560,28 +4598,31 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
/*
* There is just this request queued: if the request
* is small and the queue is not to be expired, then
* just exit.
* There is just this request queued: if
* - the request is small, and
* - we are idling to boost throughput, and
* - the queue is not to be expired,
* then just exit.
*
* In this way, if the device is being idled to wait
* for a new request from the in-service queue, we
* avoid unplugging the device and committing the
* device to serve just a small request. On the
* contrary, we wait for the block layer to decide
* when to unplug the device: hopefully, new requests
* will be merged to this one quickly, then the device
* will be unplugged and larger requests will be
* dispatched.
* device to serve just a small request. In contrast
* we wait for the block layer to decide when to
* unplug the device: hopefully, new requests will be
* merged to this one quickly, then the device will be
* unplugged and larger requests will be dispatched.
*/
if (small_req && !budget_timeout)
if (small_req && idling_boosts_thr_without_issues(bfqd, bfqq) &&
!budget_timeout)
return;
/*
* A large enough request arrived, or the queue is to
* be expired: in both cases disk idling is to be
* stopped, so clear wait_request flag and reset
* timer.
* A large enough request arrived, or idling is being
* performed to preserve service guarantees, or
* finally the queue is to be expired: in all these
* cases disk idling is to be stopped, so clear
* wait_request flag and reset timer.
*/
bfq_clear_bfqq_wait_request(bfqq);
hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
@ -4607,8 +4648,6 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
bool waiting, idle_timer_disabled = false;
if (new_bfqq) {
if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
/*
* Release the request's reference to the old bfqq
* and make sure one is taken to the shared queue.
@ -4751,6 +4790,8 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
static void bfq_update_hw_tag(struct bfq_data *bfqd)
{
struct bfq_queue *bfqq = bfqd->in_service_queue;
bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
bfqd->rq_in_driver);
@ -4763,7 +4804,18 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd)
* sum is not exact, as it's not taking into account deactivated
* requests.
*/
if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD)
return;
/*
* If active queue hasn't enough requests and can idle, bfq might not
* dispatch sufficient requests to hardware. Don't zero hw_tag in this
* case
*/
if (bfqq && bfq_bfqq_has_short_ttime(bfqq) &&
bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] <
BFQ_HW_QUEUE_THRESHOLD &&
bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD)
return;
if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
@ -4834,11 +4886,14 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
* isochronous, and both requisites for this condition to hold
* are now satisfied, then compute soft_rt_next_start (see the
* comments on the function bfq_bfqq_softrt_next_start()). We
* schedule this delayed check when bfqq expires, if it still
* has in-flight requests.
* do not compute soft_rt_next_start if bfqq is in interactive
* weight raising (see the comments in bfq_bfqq_expire() for
* an explanation). We schedule this delayed update when bfqq
* expires, if it still has in-flight requests.
*/
if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
RB_EMPTY_ROOT(&bfqq->sort_list))
RB_EMPTY_ROOT(&bfqq->sort_list) &&
bfqq->wr_coeff != bfqd->bfq_wr_coeff)
bfqq->soft_rt_next_start =
bfq_bfqq_softrt_next_start(bfqd, bfqq);

View File

@ -501,10 +501,11 @@ struct bfq_data {
unsigned int num_groups_with_pending_reqs;
/*
* Number of bfq_queues containing requests (including the
* queue in service, even if it is idling).
* Per-class (RT, BE, IDLE) number of bfq_queues containing
* requests (including the queue in service, even if it is
* idling).
*/
int busy_queues;
unsigned int busy_queues[3];
/* number of weight-raised busy @bfq_queues */
int wr_busy_queues;
/* number of queued requests */
@ -537,6 +538,9 @@ struct bfq_data {
/* on-disk position of the last served request */
sector_t last_position;
/* position of the last served request for the in-service queue */
sector_t in_serv_last_pos;
/* time of last request completion (ns) */
u64 last_completion;
@ -974,6 +978,7 @@ extern struct blkcg_policy blkcg_policy_bfq;
struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq);
struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd);
struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity);
struct bfq_entity *bfq_entity_of(struct rb_node *node);
unsigned short bfq_ioprio_to_weight(int ioprio);

View File

@ -44,6 +44,12 @@ static unsigned int bfq_class_idx(struct bfq_entity *entity)
BFQ_DEFAULT_GRP_CLASS - 1;
}
unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd)
{
return bfqd->busy_queues[0] + bfqd->busy_queues[1] +
bfqd->busy_queues[2];
}
static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
bool expiration);
@ -1513,7 +1519,7 @@ struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
struct bfq_sched_data *sd;
struct bfq_queue *bfqq;
if (bfqd->busy_queues == 0)
if (bfq_tot_busy_queues(bfqd) == 0)
return NULL;
/*
@ -1665,10 +1671,7 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfq_clear_bfqq_busy(bfqq);
bfqd->busy_queues--;
if (!bfqq->dispatched)
bfq_weights_tree_remove(bfqd, bfqq);
bfqd->busy_queues[bfqq->ioprio_class - 1]--;
if (bfqq->wr_coeff > 1)
bfqd->wr_busy_queues--;
@ -1676,6 +1679,9 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqg_stats_update_dequeue(bfqq_group(bfqq));
bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
if (!bfqq->dispatched)
bfq_weights_tree_remove(bfqd, bfqq);
}
/*
@ -1688,7 +1694,7 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfq_activate_bfqq(bfqd, bfqq);
bfq_mark_bfqq_busy(bfqq);
bfqd->busy_queues++;
bfqd->busy_queues[bfqq->ioprio_class - 1]++;
if (!bfqq->dispatched)
if (bfqq->wr_coeff == 1)

View File

@ -753,6 +753,8 @@ EXPORT_SYMBOL(bio_add_pc_page);
* @page: page to add
* @len: length of the data to add
* @off: offset of the data in @page
* @same_page: if %true only merge if the new data is in the same physical
* page as the last segment of the bio.
*
* Try to add the data at @page + @off to the last bvec of @bio. This is a
* a useful optimisation for file systems with a block size smaller than the
@ -761,19 +763,25 @@ EXPORT_SYMBOL(bio_add_pc_page);
* Return %true on success or %false on failure.
*/
bool __bio_try_merge_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off)
unsigned int len, unsigned int off, bool same_page)
{
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return false;
if (bio->bi_vcnt > 0) {
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) +
bv->bv_offset + bv->bv_len - 1;
phys_addr_t page_addr = page_to_phys(page);
if (page == bv->bv_page && off == bv->bv_offset + bv->bv_len) {
bv->bv_len += len;
bio->bi_iter.bi_size += len;
return true;
}
if (vec_end_addr + 1 != page_addr + off)
return false;
if (same_page && (vec_end_addr & PAGE_MASK) != page_addr)
return false;
bv->bv_len += len;
bio->bi_iter.bi_size += len;
return true;
}
return false;
}
@ -819,7 +827,7 @@ EXPORT_SYMBOL_GPL(__bio_add_page);
int bio_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
{
if (!__bio_try_merge_page(bio, page, len, offset)) {
if (!__bio_try_merge_page(bio, page, len, offset, false)) {
if (bio_full(bio))
return 0;
__bio_add_page(bio, page, len, offset);
@ -1072,8 +1080,9 @@ static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter)
{
int i;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, i) {
bio_for_each_segment_all(bvec, bio, i, iter_all) {
ssize_t ret;
ret = copy_page_from_iter(bvec->bv_page,
@ -1103,8 +1112,9 @@ static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter)
{
int i;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, i) {
bio_for_each_segment_all(bvec, bio, i, iter_all) {
ssize_t ret;
ret = copy_page_to_iter(bvec->bv_page,
@ -1126,8 +1136,9 @@ void bio_free_pages(struct bio *bio)
{
struct bio_vec *bvec;
int i;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, i)
bio_for_each_segment_all(bvec, bio, i, iter_all)
__free_page(bvec->bv_page);
}
EXPORT_SYMBOL(bio_free_pages);
@ -1295,6 +1306,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
struct bio *bio;
int ret;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
if (!iov_iter_count(iter))
return ERR_PTR(-EINVAL);
@ -1368,7 +1380,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
return bio;
out_unmap:
bio_for_each_segment_all(bvec, bio, j) {
bio_for_each_segment_all(bvec, bio, j, iter_all) {
put_page(bvec->bv_page);
}
bio_put(bio);
@ -1379,11 +1391,12 @@ static void __bio_unmap_user(struct bio *bio)
{
struct bio_vec *bvec;
int i;
struct bvec_iter_all iter_all;
/*
* make sure we dirty pages we wrote to
*/
bio_for_each_segment_all(bvec, bio, i) {
bio_for_each_segment_all(bvec, bio, i, iter_all) {
if (bio_data_dir(bio) == READ)
set_page_dirty_lock(bvec->bv_page);
@ -1475,8 +1488,9 @@ static void bio_copy_kern_endio_read(struct bio *bio)
char *p = bio->bi_private;
struct bio_vec *bvec;
int i;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, i) {
bio_for_each_segment_all(bvec, bio, i, iter_all) {
memcpy(p, page_address(bvec->bv_page), bvec->bv_len);
p += bvec->bv_len;
}
@ -1585,8 +1599,9 @@ void bio_set_pages_dirty(struct bio *bio)
{
struct bio_vec *bvec;
int i;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, i) {
bio_for_each_segment_all(bvec, bio, i, iter_all) {
if (!PageCompound(bvec->bv_page))
set_page_dirty_lock(bvec->bv_page);
}
@ -1596,8 +1611,9 @@ static void bio_release_pages(struct bio *bio)
{
struct bio_vec *bvec;
int i;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, i)
bio_for_each_segment_all(bvec, bio, i, iter_all)
put_page(bvec->bv_page);
}
@ -1644,8 +1660,9 @@ void bio_check_pages_dirty(struct bio *bio)
struct bio_vec *bvec;
unsigned long flags;
int i;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, i) {
bio_for_each_segment_all(bvec, bio, i, iter_all) {
if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
goto defer;
}

View File

@ -1269,7 +1269,7 @@ void blkcg_drain_queue(struct request_queue *q)
* blkcg_exit_queue - exit and release blkcg part of request_queue
* @q: request_queue being released
*
* Called from blk_release_queue(). Responsible for exiting blkcg part.
* Called from blk_exit_queue(). Responsible for exiting blkcg part.
*/
void blkcg_exit_queue(struct request_queue *q)
{

View File

@ -161,6 +161,73 @@ static inline unsigned get_max_io_size(struct request_queue *q,
return sectors;
}
static unsigned get_max_segment_size(struct request_queue *q,
unsigned offset)
{
unsigned long mask = queue_segment_boundary(q);
/* default segment boundary mask means no boundary limit */
if (mask == BLK_SEG_BOUNDARY_MASK)
return queue_max_segment_size(q);
return min_t(unsigned long, mask - (mask & offset) + 1,
queue_max_segment_size(q));
}
/*
* Split the bvec @bv into segments, and update all kinds of
* variables.
*/
static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
unsigned *nsegs, unsigned *last_seg_size,
unsigned *front_seg_size, unsigned *sectors)
{
unsigned len = bv->bv_len;
unsigned total_len = 0;
unsigned new_nsegs = 0, seg_size = 0;
/*
* Multi-page bvec may be too big to hold in one segment, so the
* current bvec has to be splitted as multiple segments.
*/
while (len && new_nsegs + *nsegs < queue_max_segments(q)) {
seg_size = get_max_segment_size(q, bv->bv_offset + total_len);
seg_size = min(seg_size, len);
new_nsegs++;
total_len += seg_size;
len -= seg_size;
if ((bv->bv_offset + total_len) & queue_virt_boundary(q))
break;
}
if (!new_nsegs)
return !!len;
/* update front segment size */
if (!*nsegs) {
unsigned first_seg_size;
if (new_nsegs == 1)
first_seg_size = get_max_segment_size(q, bv->bv_offset);
else
first_seg_size = queue_max_segment_size(q);
if (*front_seg_size < first_seg_size)
*front_seg_size = first_seg_size;
}
/* update other varibles */
*last_seg_size = seg_size;
*nsegs += new_nsegs;
if (sectors)
*sectors += total_len >> 9;
/* split in the middle of the bvec if len != 0 */
return !!len;
}
static struct bio *blk_bio_segment_split(struct request_queue *q,
struct bio *bio,
struct bio_set *bs,
@ -174,7 +241,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
struct bio *new = NULL;
const unsigned max_sectors = get_max_io_size(q, bio);
bio_for_each_segment(bv, bio, iter) {
bio_for_each_bvec(bv, bio, iter) {
/*
* If the queue doesn't support SG gaps and adding this
* offset would create a gap, disallow it.
@ -189,8 +256,12 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
*/
if (nsegs < queue_max_segments(q) &&
sectors < max_sectors) {
nsegs++;
sectors = max_sectors;
/* split in the middle of bvec */
bv.bv_len = (max_sectors - sectors) << 9;
bvec_split_segs(q, &bv, &nsegs,
&seg_size,
&front_seg_size,
&sectors);
}
goto split;
}
@ -206,21 +277,28 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
bvprvp = &bvprv;
sectors += bv.bv_len >> 9;
if (nsegs == 1 && seg_size > front_seg_size)
front_seg_size = seg_size;
continue;
}
new_segment:
if (nsegs == queue_max_segments(q))
goto split;
if (nsegs == 1 && seg_size > front_seg_size)
front_seg_size = seg_size;
nsegs++;
bvprv = bv;
bvprvp = &bvprv;
seg_size = bv.bv_len;
sectors += bv.bv_len >> 9;
if (bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
nsegs++;
seg_size = bv.bv_len;
sectors += bv.bv_len >> 9;
if (nsegs == 1 && seg_size > front_seg_size)
front_seg_size = seg_size;
} else if (bvec_split_segs(q, &bv, &nsegs, &seg_size,
&front_seg_size, &sectors)) {
goto split;
}
}
do_split = false;
@ -233,8 +311,6 @@ split:
bio = new;
}
if (nsegs == 1 && seg_size > front_seg_size)
front_seg_size = seg_size;
bio->bi_seg_front_size = front_seg_size;
if (seg_size > bio->bi_seg_back_size)
bio->bi_seg_back_size = seg_size;
@ -291,18 +367,20 @@ void blk_queue_split(struct request_queue *q, struct bio **bio)
EXPORT_SYMBOL(blk_queue_split);
static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
struct bio *bio,
bool no_sg_merge)
struct bio *bio)
{
struct bio_vec bv, bvprv = { NULL };
int prev = 0;
unsigned int seg_size, nr_phys_segs;
unsigned front_seg_size;
struct bio *fbio, *bbio;
struct bvec_iter iter;
if (!bio)
return 0;
front_seg_size = bio->bi_seg_front_size;
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
@ -316,14 +394,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
seg_size = 0;
nr_phys_segs = 0;
for_each_bio(bio) {
bio_for_each_segment(bv, bio, iter) {
/*
* If SG merging is disabled, each bio vector is
* a segment
*/
if (no_sg_merge)
goto new_segment;
bio_for_each_bvec(bv, bio, iter) {
if (prev) {
if (seg_size + bv.bv_len
> queue_max_segment_size(q))
@ -333,23 +404,23 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
seg_size += bv.bv_len;
bvprv = bv;
if (nr_phys_segs == 1 && seg_size >
front_seg_size)
front_seg_size = seg_size;
continue;
}
new_segment:
if (nr_phys_segs == 1 && seg_size >
fbio->bi_seg_front_size)
fbio->bi_seg_front_size = seg_size;
nr_phys_segs++;
bvprv = bv;
prev = 1;
seg_size = bv.bv_len;
bvec_split_segs(q, &bv, &nr_phys_segs, &seg_size,
&front_seg_size, NULL);
}
bbio = bio;
}
if (nr_phys_segs == 1 && seg_size > fbio->bi_seg_front_size)
fbio->bi_seg_front_size = seg_size;
fbio->bi_seg_front_size = front_seg_size;
if (seg_size > bbio->bi_seg_back_size)
bbio->bi_seg_back_size = seg_size;
@ -358,33 +429,16 @@ new_segment:
void blk_recalc_rq_segments(struct request *rq)
{
bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE,
&rq->q->queue_flags);
rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio,
no_sg_merge);
rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio);
}
void blk_recount_segments(struct request_queue *q, struct bio *bio)
{
unsigned short seg_cnt;
struct bio *nxt = bio->bi_next;
/* estimate segment number by bi_vcnt for non-cloned bio */
if (bio_flagged(bio, BIO_CLONED))
seg_cnt = bio_segments(bio);
else
seg_cnt = bio->bi_vcnt;
if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) &&
(seg_cnt < queue_max_segments(q)))
bio->bi_phys_segments = seg_cnt;
else {
struct bio *nxt = bio->bi_next;
bio->bi_next = NULL;
bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, false);
bio->bi_next = nxt;
}
bio->bi_next = NULL;
bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
bio->bi_next = nxt;
bio_set_flag(bio, BIO_SEG_VALID);
}
@ -407,6 +461,54 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
return biovec_phys_mergeable(q, &end_bv, &nxt_bv);
}
static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
struct scatterlist *sglist)
{
if (!*sg)
return sglist;
/*
* If the driver previously mapped a shorter list, we could see a
* termination bit prematurely unless it fully inits the sg table
* on each mapping. We KNOW that there must be more entries here
* or the driver would be buggy, so force clear the termination bit
* to avoid doing a full sg_init_table() in drivers for each command.
*/
sg_unmark_end(*sg);
return sg_next(*sg);
}
static unsigned blk_bvec_map_sg(struct request_queue *q,
struct bio_vec *bvec, struct scatterlist *sglist,
struct scatterlist **sg)
{
unsigned nbytes = bvec->bv_len;
unsigned nsegs = 0, total = 0, offset = 0;
while (nbytes > 0) {
unsigned seg_size;
struct page *pg;
unsigned idx;
*sg = blk_next_sg(sg, sglist);
seg_size = get_max_segment_size(q, bvec->bv_offset + total);
seg_size = min(nbytes, seg_size);
offset = (total + bvec->bv_offset) % PAGE_SIZE;
idx = (total + bvec->bv_offset) / PAGE_SIZE;
pg = bvec_nth_page(bvec->bv_page, idx);
sg_set_page(*sg, pg, seg_size, offset);
total += seg_size;
nbytes -= seg_size;
nsegs++;
}
return nsegs;
}
static inline void
__blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
struct scatterlist *sglist, struct bio_vec *bvprv,
@ -424,25 +526,12 @@ __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
(*sg)->length += nbytes;
} else {
new_segment:
if (!*sg)
*sg = sglist;
else {
/*
* If the driver previously mapped a shorter
* list, we could see a termination bit
* prematurely unless it fully inits the sg
* table on each mapping. We KNOW that there
* must be more entries here or the driver
* would be buggy, so force clear the
* termination bit to avoid doing a full
* sg_init_table() in drivers for each command.
*/
sg_unmark_end(*sg);
*sg = sg_next(*sg);
}
sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
(*nsegs)++;
if (bvec->bv_offset + bvec->bv_len <= PAGE_SIZE) {
*sg = blk_next_sg(sg, sglist);
sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
(*nsegs) += 1;
} else
(*nsegs) += blk_bvec_map_sg(q, bvec, sglist, sg);
}
*bvprv = *bvec;
}
@ -464,7 +553,7 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
int nsegs = 0;
for_each_bio(bio)
bio_for_each_segment(bvec, bio, iter)
bio_for_each_bvec(bvec, bio, iter)
__blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg,
&nsegs);

View File

@ -128,11 +128,9 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(SAME_FORCE),
QUEUE_FLAG_NAME(DEAD),
QUEUE_FLAG_NAME(INIT_DONE),
QUEUE_FLAG_NAME(NO_SG_MERGE),
QUEUE_FLAG_NAME(POLL),
QUEUE_FLAG_NAME(WC),
QUEUE_FLAG_NAME(FUA),
QUEUE_FLAG_NAME(FLUSH_NQ),
QUEUE_FLAG_NAME(DAX),
QUEUE_FLAG_NAME(STATS),
QUEUE_FLAG_NAME(POLL_STATS),
@ -251,7 +249,6 @@ static const char *const alloc_policy_name[] = {
static const char *const hctx_flag_name[] = {
HCTX_FLAG_NAME(SHOULD_MERGE),
HCTX_FLAG_NAME(TAG_SHARED),
HCTX_FLAG_NAME(SG_MERGE),
HCTX_FLAG_NAME(BLOCKING),
HCTX_FLAG_NAME(NO_SCHED),
};

View File

@ -321,7 +321,7 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
{
struct elevator_queue *e = q->elevator;
struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx->cpu);
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
bool ret = false;
enum hctx_type type;

View File

@ -170,7 +170,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
data->ctx = blk_mq_get_ctx(data->q);
data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
data->ctx->cpu);
data->ctx);
tags = blk_mq_tags_from_data(data);
if (data->flags & BLK_MQ_REQ_RESERVED)
bt = &tags->breserved_tags;

View File

@ -364,7 +364,7 @@ static struct request *blk_mq_get_request(struct request_queue *q,
}
if (likely(!data->hctx))
data->hctx = blk_mq_map_queue(q, data->cmd_flags,
data->ctx->cpu);
data->ctx);
if (data->cmd_flags & REQ_NOWAIT)
data->flags |= BLK_MQ_REQ_NOWAIT;
@ -2069,7 +2069,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
struct blk_mq_tags *tags;
int node;
node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
if (node == NUMA_NO_NODE)
node = set->numa_node;
@ -2125,7 +2125,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
size_t rq_size, left;
int node;
node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
if (node == NUMA_NO_NODE)
node = set->numa_node;
@ -2424,7 +2424,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
* If the cpu isn't present, the cpu is mapped to first hctx.
*/
for_each_possible_cpu(i) {
hctx_idx = set->map[0].mq_map[i];
hctx_idx = set->map[HCTX_TYPE_DEFAULT].mq_map[i];
/* unmapped hw queue can be remapped after CPU topo changed */
if (!set->tags[hctx_idx] &&
!__blk_mq_alloc_rq_map(set, hctx_idx)) {
@ -2434,16 +2434,19 @@ static void blk_mq_map_swqueue(struct request_queue *q)
* case, remap the current ctx to hctx[0] which
* is guaranteed to always have tags allocated
*/
set->map[0].mq_map[i] = 0;
set->map[HCTX_TYPE_DEFAULT].mq_map[i] = 0;
}
ctx = per_cpu_ptr(q->queue_ctx, i);
for (j = 0; j < set->nr_maps; j++) {
if (!set->map[j].nr_queues)
if (!set->map[j].nr_queues) {
ctx->hctxs[j] = blk_mq_map_queue_type(q,
HCTX_TYPE_DEFAULT, i);
continue;
}
hctx = blk_mq_map_queue_type(q, j, i);
ctx->hctxs[j] = hctx;
/*
* If the CPU is already set in the mask, then we've
* mapped this one already. This can happen if
@ -2463,6 +2466,10 @@ static void blk_mq_map_swqueue(struct request_queue *q)
*/
BUG_ON(!hctx->nr_ctx);
}
for (; j < HCTX_MAX_TYPES; j++)
ctx->hctxs[j] = blk_mq_map_queue_type(q,
HCTX_TYPE_DEFAULT, i);
}
mutex_unlock(&q->sysfs_lock);
@ -2734,7 +2741,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
int node;
struct blk_mq_hw_ctx *hctx;
node = blk_mq_hw_queue_to_node(&set->map[0], i);
node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
/*
* If the hw queue has been mapped to another numa node,
* we need to realloc the hctx. If allocation fails, fallback
@ -2838,9 +2845,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
set->map[HCTX_TYPE_POLL].nr_queues)
blk_queue_flag_set(QUEUE_FLAG_POLL, q);
if (!(set->flags & BLK_MQ_F_SG_MERGE))
blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q);
q->sg_reserved_size = INT_MAX;
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
@ -2968,7 +2972,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
return set->ops->map_queues(set);
} else {
BUG_ON(set->nr_maps > 1);
return blk_mq_map_queues(&set->map[0]);
return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
}
}
@ -3090,6 +3094,9 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
if (!set)
return -EINVAL;
if (q->nr_requests == nr)
return 0;
blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
@ -3235,7 +3242,7 @@ fallback:
pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
nr_hw_queues, prev_nr_hw_queues);
set->nr_hw_queues = prev_nr_hw_queues;
blk_mq_map_queues(&set->map[0]);
blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
goto fallback;
}
blk_mq_map_swqueue(q);

View File

@ -23,6 +23,7 @@ struct blk_mq_ctx {
unsigned int cpu;
unsigned short index_hw[HCTX_MAX_TYPES];
struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES];
/* incremented at dispatch time */
unsigned long rq_dispatched[2];
@ -96,26 +97,23 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *
* blk_mq_map_queue() - map (cmd_flags,type) to hardware queue
* @q: request queue
* @flags: request command flags
* @cpu: CPU
* @cpu: cpu ctx
*/
static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
unsigned int flags,
unsigned int cpu)
struct blk_mq_ctx *ctx)
{
enum hctx_type type = HCTX_TYPE_DEFAULT;
if ((flags & REQ_HIPRI) &&
q->tag_set->nr_maps > HCTX_TYPE_POLL &&
q->tag_set->map[HCTX_TYPE_POLL].nr_queues &&
test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
/*
* The caller ensure that if REQ_HIPRI, poll must be enabled.
*/
if (flags & REQ_HIPRI)
type = HCTX_TYPE_POLL;
else if (((flags & REQ_OP_MASK) == REQ_OP_READ) &&
q->tag_set->nr_maps > HCTX_TYPE_READ &&
q->tag_set->map[HCTX_TYPE_READ].nr_queues)
else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
type = HCTX_TYPE_READ;
return blk_mq_map_queue_type(q, type, cpu);
return ctx->hctxs[type];
}
/*

View File

@ -799,15 +799,6 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
}
EXPORT_SYMBOL(blk_queue_update_dma_alignment);
void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
{
if (queueable)
blk_queue_flag_clear(QUEUE_FLAG_FLUSH_NQ, q);
else
blk_queue_flag_set(QUEUE_FLAG_FLUSH_NQ, q);
}
EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
/**
* blk_set_queue_depth - tell the block layer about the device queue depth
* @q: the request queue for the device

View File

@ -468,6 +468,9 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
else if (val >= 0)
val *= 1000ULL;
if (wbt_get_min_lat(q) == val)
return count;
/*
* Ensure that the queue is idled, in case the latency update
* ends up either enabling or disabling wbt completely. We can't
@ -817,21 +820,16 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
}
/**
* __blk_release_queue - release a request queue when it is no longer needed
* __blk_release_queue - release a request queue
* @work: pointer to the release_work member of the request queue to be released
*
* Description:
* blk_release_queue is the counterpart of blk_init_queue(). It should be
* called when a request queue is being released; typically when a block
* device is being de-registered. Its primary task it to free the queue
* itself.
*
* Notes:
* The low level driver must have finished any outstanding requests first
* via blk_cleanup_queue().
*
* Although blk_release_queue() may be called with preemption disabled,
* __blk_release_queue() may sleep.
* This function is called when a block device is being unregistered. The
* process of releasing a request queue starts with blk_cleanup_queue, which
* set the appropriate flags and then calls blk_put_queue, that decrements
* the reference counter of the request queue. Once the reference counter
* of the request queue reaches zero, blk_release_queue is called to release
* all allocated resources of the request queue.
*/
static void __blk_release_queue(struct work_struct *work)
{

View File

@ -38,7 +38,7 @@ extern struct ida blk_queue_ida;
static inline struct blk_flush_queue *
blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
{
return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx->cpu)->fq;
return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
}
static inline void __blk_get_queue(struct request_queue *q)

View File

@ -165,11 +165,12 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool)
struct bio_vec *bvec, orig_vec;
int i;
struct bvec_iter orig_iter = bio_orig->bi_iter;
struct bvec_iter_all iter_all;
/*
* free up bounce indirect pages used
*/
bio_for_each_segment_all(bvec, bio, i) {
bio_for_each_segment_all(bvec, bio, i, iter_all) {
orig_vec = bio_iter_iovec(bio_orig, orig_iter);
if (bvec->bv_page != orig_vec.bv_page) {
dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
@ -313,7 +314,12 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL :
&bounce_bio_set);
bio_for_each_segment_all(to, bio, i) {
/*
* Bvec table can't be updated by bio_for_each_segment_all(),
* so retrieve bvec from the table directly. This way is safe
* because the 'bio' is single-page bvec.
*/
for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) {
struct page *page = to->bv_page;
if (page_to_pfn(page) <= q->limits.bounce_pfn)

View File

@ -667,8 +667,11 @@ static int __elevator_change(struct request_queue *q, const char *name)
/*
* Special case for mq, turn off scheduling
*/
if (!strncmp(name, "none", 4))
if (!strncmp(name, "none", 4)) {
if (!q->elevator)
return 0;
return elevator_switch(q, NULL);
}
strlcpy(elevator_name, name, sizeof(elevator_name));
e = elevator_get(q, strstrip(elevator_name), true);

View File

@ -365,8 +365,8 @@ int register_blkdev(unsigned int major, const char *name)
}
if (index == 0) {
printk("register_blkdev: failed to get major for %s\n",
name);
printk("%s: failed to get major for %s\n",
__func__, name);
ret = -EBUSY;
goto out;
}
@ -375,8 +375,8 @@ int register_blkdev(unsigned int major, const char *name)
}
if (major >= BLKDEV_MAJOR_MAX) {
pr_err("register_blkdev: major requested (%u) is greater than the maximum (%u) for %s\n",
major, BLKDEV_MAJOR_MAX-1, name);
pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n",
__func__, major, BLKDEV_MAJOR_MAX-1, name);
ret = -EINVAL;
goto out;
@ -655,10 +655,12 @@ exit:
kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
disk_part_iter_exit(&piter);
err = sysfs_create_link(&ddev->kobj,
&disk->queue->backing_dev_info->dev->kobj,
"bdi");
WARN_ON(err);
if (disk->queue->backing_dev_info->dev) {
err = sysfs_create_link(&ddev->kobj,
&disk->queue->backing_dev_info->dev->kobj,
"bdi");
WARN_ON(err);
}
}
/**

View File

@ -1318,8 +1318,6 @@ static int ata_scsi_dev_config(struct scsi_device *sdev,
scsi_change_queue_depth(sdev, depth);
}
blk_queue_flush_queueable(q, false);
if (dev->flags & ATA_DFLAG_TRUSTED)
sdev->security_supported = 1;

View File

@ -2230,7 +2230,6 @@ static void floppy_end_request(struct request *req, blk_status_t error)
static void request_done(int uptodate)
{
struct request *req = current_req;
struct request_queue *q;
int block;
char msg[sizeof("request done ") + sizeof(int) * 3];
@ -2243,8 +2242,6 @@ static void request_done(int uptodate)
return;
}
q = req->q;
if (uptodate) {
/* maintain values for invalidation on geometry
* change */

View File

@ -511,21 +511,22 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
loff_t pos, bool rw)
{
struct iov_iter iter;
struct req_iterator rq_iter;
struct bio_vec *bvec;
struct request *rq = blk_mq_rq_from_pdu(cmd);
struct bio *bio = rq->bio;
struct file *file = lo->lo_backing_file;
struct bio_vec tmp;
unsigned int offset;
int segments = 0;
int nr_bvec = 0;
int ret;
if (rq->bio != rq->biotail) {
struct req_iterator iter;
struct bio_vec tmp;
rq_for_each_bvec(tmp, rq, rq_iter)
nr_bvec++;
__rq_for_each_bio(bio, rq)
segments += bio_segments(bio);
bvec = kmalloc_array(segments, sizeof(struct bio_vec),
if (rq->bio != rq->biotail) {
bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
GFP_NOIO);
if (!bvec)
return -EIO;
@ -534,10 +535,10 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
/*
* The bios of the request may be started from the middle of
* the 'bvec' because of bio splitting, so we can't directly
* copy bio->bi_iov_vec to new bvec. The rq_for_each_segment
* copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec
* API will take care of all details for us.
*/
rq_for_each_segment(tmp, rq, iter) {
rq_for_each_bvec(tmp, rq, rq_iter) {
*bvec = tmp;
bvec++;
}
@ -551,11 +552,10 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
*/
offset = bio->bi_iter.bi_bvec_done;
bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
segments = bio_segments(bio);
}
atomic_set(&cmd->ref, 2);
iov_iter_bvec(&iter, rw, bvec, segments, blk_rq_bytes(rq));
iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq));
iter.iov_offset = offset;
cmd->iocb.ki_pos = pos;
@ -1089,16 +1089,12 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
}
mapping_set_gfp_mask(filp->f_mapping, gfp);
lo->lo_state = Lo_unbound;
/* This is safe: open() is still holding a reference. */
module_put(THIS_MODULE);
blk_mq_unfreeze_queue(lo->lo_queue);
partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev;
lo_number = lo->lo_number;
lo->lo_flags = 0;
if (!part_shift)
lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
loop_unprepare_queue(lo);
out_unlock:
mutex_unlock(&loop_ctl_mutex);
@ -1115,11 +1111,29 @@ out_unlock:
err = __blkdev_reread_part(bdev);
else
err = blkdev_reread_part(bdev);
pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
__func__, lo_number, err);
if (err)
pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
__func__, lo_number, err);
/* Device is gone, no point in returning error */
err = 0;
}
/*
* lo->lo_state is set to Lo_unbound here after above partscan has
* finished.
*
* There cannot be anybody else entering __loop_clr_fd() as
* lo->lo_backing_file is already cleared and Lo_rundown state
* protects us from all the other places trying to change the 'lo'
* device.
*/
mutex_lock(&loop_ctl_mutex);
lo->lo_flags = 0;
if (!part_shift)
lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
lo->lo_state = Lo_unbound;
mutex_unlock(&loop_ctl_mutex);
/*
* Need not hold loop_ctl_mutex to fput backing file.
* Calling fput holding loop_ctl_mutex triggers a circular
@ -1937,7 +1951,7 @@ static int loop_add(struct loop_device **l, int i)
lo->tag_set.queue_depth = 128;
lo->tag_set.numa_node = NUMA_NO_NODE;
lo->tag_set.cmd_size = sizeof(struct loop_cmd);
lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
lo->tag_set.driver_data = lo;
err = blk_mq_alloc_tag_set(&lo->tag_set);

View File

@ -1416,7 +1416,7 @@ static blk_status_t mtip_send_trim(struct driver_data *dd, unsigned int lba,
WARN_ON(sizeof(struct mtip_trim) > ATA_SECT_SIZE);
/* Allocate a DMA buffer for the trim structure */
buf = dmam_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr,
buf = dma_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr,
GFP_KERNEL);
if (!buf)
return BLK_STS_RESOURCE;
@ -1453,7 +1453,7 @@ static blk_status_t mtip_send_trim(struct driver_data *dd, unsigned int lba,
MTIP_TRIM_TIMEOUT_MS) < 0)
ret = BLK_STS_IOERR;
dmam_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr);
dma_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr);
return ret;
}
@ -1656,7 +1656,7 @@ static int exec_drive_command(struct mtip_port *port, u8 *command,
if (!user_buffer)
return -EFAULT;
buf = dmam_alloc_coherent(&port->dd->pdev->dev,
buf = dma_alloc_coherent(&port->dd->pdev->dev,
ATA_SECT_SIZE * xfer_sz,
&dma_addr,
GFP_KERNEL);
@ -1734,7 +1734,7 @@ static int exec_drive_command(struct mtip_port *port, u8 *command,
}
exit_drive_command:
if (buf)
dmam_free_coherent(&port->dd->pdev->dev,
dma_free_coherent(&port->dd->pdev->dev,
ATA_SECT_SIZE * xfer_sz, buf, dma_addr);
return rv;
}
@ -2838,11 +2838,11 @@ static void mtip_dma_free(struct driver_data *dd)
struct mtip_port *port = dd->port;
if (port->block1)
dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
dma_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
port->block1, port->block1_dma);
if (port->command_list) {
dmam_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
dma_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
port->command_list, port->command_list_dma);
}
}
@ -2861,7 +2861,7 @@ static int mtip_dma_alloc(struct driver_data *dd)
/* Allocate dma memory for RX Fis, Identify, and Sector Bufffer */
port->block1 =
dmam_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
dma_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
&port->block1_dma, GFP_KERNEL);
if (!port->block1)
return -ENOMEM;
@ -2869,10 +2869,10 @@ static int mtip_dma_alloc(struct driver_data *dd)
/* Allocate dma memory for command list */
port->command_list =
dmam_alloc_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
dma_alloc_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
&port->command_list_dma, GFP_KERNEL);
if (!port->command_list) {
dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
dma_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
port->block1, port->block1_dma);
port->block1 = NULL;
port->block1_dma = 0;
@ -3057,13 +3057,8 @@ static int mtip_hw_init(struct driver_data *dd)
mtip_start_port(dd->port);
/* Setup the ISR and enable interrupts. */
rv = devm_request_irq(&dd->pdev->dev,
dd->pdev->irq,
mtip_irq_handler,
IRQF_SHARED,
dev_driver_string(&dd->pdev->dev),
dd);
rv = request_irq(dd->pdev->irq, mtip_irq_handler, IRQF_SHARED,
dev_driver_string(&dd->pdev->dev), dd);
if (rv) {
dev_err(&dd->pdev->dev,
"Unable to allocate IRQ %d\n", dd->pdev->irq);
@ -3091,7 +3086,7 @@ out3:
/* Release the IRQ. */
irq_set_affinity_hint(dd->pdev->irq, NULL);
devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
free_irq(dd->pdev->irq, dd);
out2:
mtip_deinit_port(dd->port);
@ -3146,7 +3141,7 @@ static int mtip_hw_exit(struct driver_data *dd)
/* Release the IRQ. */
irq_set_affinity_hint(dd->pdev->irq, NULL);
devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
free_irq(dd->pdev->irq, dd);
msleep(1000);
/* Free dma regions */
@ -3610,8 +3605,8 @@ static void mtip_free_cmd(struct blk_mq_tag_set *set, struct request *rq,
if (!cmd->command)
return;
dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
cmd->command, cmd->command_dma);
dma_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, cmd->command,
cmd->command_dma);
}
static int mtip_init_cmd(struct blk_mq_tag_set *set, struct request *rq,
@ -3620,7 +3615,7 @@ static int mtip_init_cmd(struct blk_mq_tag_set *set, struct request *rq,
struct driver_data *dd = set->driver_data;
struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
cmd->command = dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
cmd->command = dma_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
&cmd->command_dma, GFP_KERNEL);
if (!cmd->command)
return -ENOMEM;

View File

@ -1571,7 +1571,7 @@ static int nbd_dev_add(int index)
nbd->tag_set.numa_node = NUMA_NO_NODE;
nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING;
BLK_MQ_F_BLOCKING;
nbd->tag_set.driver_data = nbd;
err = blk_mq_alloc_tag_set(&nbd->tag_set);
@ -2118,8 +2118,7 @@ static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
}
nla_nest_end(reply, dev_list);
genlmsg_end(reply, reply_head);
genlmsg_reply(reply, info);
ret = 0;
ret = genlmsg_reply(reply, info);
out:
mutex_unlock(&nbd_index_mutex);
return ret;

View File

@ -1104,7 +1104,7 @@ static int null_handle_bio(struct nullb_cmd *cmd)
len = bvec.bv_len;
err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
op_is_write(bio_op(bio)), sector,
bio_op(bio) & REQ_FUA);
bio->bi_opf & REQ_FUA);
if (err) {
spin_unlock_irq(&nullb->lock);
return err;
@ -1678,7 +1678,6 @@ static int null_add_dev(struct nullb_device *dev)
if (dev->cache_size > 0) {
set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
blk_queue_write_cache(nullb->q, true, true);
blk_queue_flush_queueable(nullb->q, true);
}
if (dev->zoned) {

View File

@ -3987,7 +3987,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
rbd_dev->tag_set.ops = &rbd_mq_ops;
rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
rbd_dev->tag_set.nr_hw_queues = 1;
rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);

View File

@ -2843,7 +2843,6 @@ static int skd_cons_disk(struct skd_device *skdev)
skdev->sgs_per_request * sizeof(struct scatterlist);
skdev->tag_set.numa_node = NUMA_NO_NODE;
skdev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
BLK_MQ_F_SG_MERGE |
BLK_ALLOC_POLICY_TO_MQ_FLAG(BLK_TAG_ALLOC_FIFO);
skdev->tag_set.driver_data = skdev;
rc = blk_mq_alloc_tag_set(&skdev->tag_set);

View File

@ -977,7 +977,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
} else
info->tag_set.queue_depth = BLK_RING_SIZE(info);
info->tag_set.numa_node = NUMA_NO_NODE;
info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
info->tag_set.cmd_size = sizeof(struct blkif_req);
info->tag_set.driver_data = info;

View File

@ -265,6 +265,7 @@
/* #define ERRLOGMASK (CD_WARNING|CD_OPEN|CD_COUNT_TRACKS|CD_CLOSE) */
/* #define ERRLOGMASK (CD_WARNING|CD_REG_UNREG|CD_DO_IOCTL|CD_OPEN|CD_CLOSE|CD_COUNT_TRACKS) */
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/major.h>
@ -3692,9 +3693,9 @@ static struct ctl_table_header *cdrom_sysctl_header;
static void cdrom_sysctl_register(void)
{
static int initialized;
static atomic_t initialized = ATOMIC_INIT(0);
if (initialized == 1)
if (!atomic_add_unless(&initialized, 1, 1))
return;
cdrom_sysctl_header = register_sysctl_table(cdrom_root_table);
@ -3705,8 +3706,6 @@ static void cdrom_sysctl_register(void)
cdrom_sysctl_settings.debug = debug;
cdrom_sysctl_settings.lock = lockdoor;
cdrom_sysctl_settings.check = check_media_type;
initialized = 1;
}
static void cdrom_sysctl_unregister(void)

View File

@ -141,7 +141,7 @@ struct nvm_chk_meta *pblk_get_chunk_meta(struct pblk *pblk)
ret = nvm_get_chunk_meta(dev, ppa, geo->all_chunks, meta);
if (ret) {
kfree(meta);
vfree(meta);
return ERR_PTR(-EIO);
}
@ -1065,7 +1065,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line,
bitmap_set(line->lun_bitmap, 0, lm->lun_bitmap_len);
smeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC);
memcpy(smeta_buf->header.uuid, pblk->instance_uuid, 16);
guid_copy((guid_t *)&smeta_buf->header.uuid, &pblk->instance_uuid);
smeta_buf->header.id = cpu_to_le32(line->id);
smeta_buf->header.type = cpu_to_le16(line->type);
smeta_buf->header.version_major = SMETA_VERSION_MAJOR;
@ -1278,6 +1278,7 @@ static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line)
spin_unlock(&line->lock);
kref_init(&line->ref);
atomic_set(&line->sec_to_update, 0);
return 0;
}
@ -1874,7 +1875,8 @@ void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line)
if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC) {
emeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC);
memcpy(emeta_buf->header.uuid, pblk->instance_uuid, 16);
guid_copy((guid_t *)&emeta_buf->header.uuid,
&pblk->instance_uuid);
emeta_buf->header.id = cpu_to_le32(line->id);
emeta_buf->header.type = cpu_to_le16(line->type);
emeta_buf->header.version_major = EMETA_VERSION_MAJOR;

View File

@ -365,16 +365,22 @@ static struct pblk_line *pblk_gc_get_victim_line(struct pblk *pblk,
struct list_head *group_list)
{
struct pblk_line *line, *victim;
int line_vsc, victim_vsc;
unsigned int line_vsc = ~0x0L, victim_vsc = ~0x0L;
victim = list_first_entry(group_list, struct pblk_line, list);
list_for_each_entry(line, group_list, list) {
line_vsc = le32_to_cpu(*line->vsc);
victim_vsc = le32_to_cpu(*victim->vsc);
if (line_vsc < victim_vsc)
if (!atomic_read(&line->sec_to_update))
line_vsc = le32_to_cpu(*line->vsc);
if (line_vsc < victim_vsc) {
victim = line;
victim_vsc = le32_to_cpu(*victim->vsc);
}
}
if (victim_vsc == ~0x0)
return NULL;
return victim;
}
@ -448,13 +454,13 @@ next_gc_group:
do {
spin_lock(&l_mg->gc_lock);
if (list_empty(group_list)) {
line = pblk_gc_get_victim_line(pblk, group_list);
if (!line) {
spin_unlock(&l_mg->gc_lock);
break;
}
line = pblk_gc_get_victim_line(pblk, group_list);
spin_lock(&line->lock);
WARN_ON(line->state != PBLK_LINESTATE_CLOSED);
line->state = PBLK_LINESTATE_GC;

View File

@ -130,7 +130,7 @@ static int pblk_l2p_recover(struct pblk *pblk, bool factory_init)
struct pblk_line *line = NULL;
if (factory_init) {
pblk_setup_uuid(pblk);
guid_gen(&pblk->instance_uuid);
} else {
line = pblk_recov_l2p(pblk);
if (IS_ERR(line)) {
@ -584,14 +584,12 @@ static void pblk_lines_free(struct pblk *pblk)
struct pblk_line *line;
int i;
spin_lock(&l_mg->free_lock);
for (i = 0; i < l_mg->nr_lines; i++) {
line = &pblk->lines[i];
pblk_line_free(line);
pblk_line_meta_free(l_mg, line);
}
spin_unlock(&l_mg->free_lock);
pblk_line_mg_free(pblk);

View File

@ -73,6 +73,7 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
*/
if (i < valid_secs) {
kref_get(&line->ref);
atomic_inc(&line->sec_to_update);
w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i);
w_ctx->ppa = ppa_list[i];
meta->lba = cpu_to_le64(w_ctx->lba);

View File

@ -45,10 +45,23 @@ void pblk_rb_free(struct pblk_rb *rb)
/*
* pblk_rb_calculate_size -- calculate the size of the write buffer
*/
static unsigned int pblk_rb_calculate_size(unsigned int nr_entries)
static unsigned int pblk_rb_calculate_size(unsigned int nr_entries,
unsigned int threshold)
{
/* Alloc a write buffer that can at least fit 128 entries */
return (1 << max(get_count_order(nr_entries), 7));
unsigned int thr_sz = 1 << (get_count_order(threshold + NVM_MAX_VLBA));
unsigned int max_sz = max(thr_sz, nr_entries);
unsigned int max_io;
/* Alloc a write buffer that can (i) fit at least two split bios
* (considering max I/O size NVM_MAX_VLBA, and (ii) guarantee that the
* threshold will be respected
*/
max_io = (1 << max((int)(get_count_order(max_sz)),
(int)(get_count_order(NVM_MAX_VLBA << 1))));
if ((threshold + NVM_MAX_VLBA) >= max_io)
max_io <<= 1;
return max_io;
}
/*
@ -67,12 +80,12 @@ int pblk_rb_init(struct pblk_rb *rb, unsigned int size, unsigned int threshold,
unsigned int alloc_order, order, iter;
unsigned int nr_entries;
nr_entries = pblk_rb_calculate_size(size);
nr_entries = pblk_rb_calculate_size(size, threshold);
entries = vzalloc(array_size(nr_entries, sizeof(struct pblk_rb_entry)));
if (!entries)
return -ENOMEM;
power_size = get_count_order(size);
power_size = get_count_order(nr_entries);
power_seg_sz = get_count_order(seg_size);
down_write(&pblk_rb_lock);
@ -149,7 +162,7 @@ int pblk_rb_init(struct pblk_rb *rb, unsigned int size, unsigned int threshold,
* Initialize rate-limiter, which controls access to the write buffer
* by user and GC I/O
*/
pblk_rl_init(&pblk->rl, rb->nr_entries);
pblk_rl_init(&pblk->rl, rb->nr_entries, threshold);
return 0;
}
@ -247,6 +260,7 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int to_update)
entry->cacheline);
line = pblk_ppa_to_line(pblk, w_ctx->ppa);
atomic_dec(&line->sec_to_update);
kref_put(&line->ref, pblk_line_put);
clean_wctx(w_ctx);
rb->l2p_update = pblk_rb_ptr_wrap(rb, rb->l2p_update, 1);

View File

@ -302,35 +302,55 @@ static int pblk_pad_distance(struct pblk *pblk, struct pblk_line *line)
return (distance > line->left_msecs) ? line->left_msecs : distance;
}
static int pblk_line_wp_is_unbalanced(struct pblk *pblk,
struct pblk_line *line)
/* Return a chunk belonging to a line by stripe(write order) index */
static struct nvm_chk_meta *pblk_get_stripe_chunk(struct pblk *pblk,
struct pblk_line *line,
int index)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct pblk_line_meta *lm = &pblk->lm;
struct pblk_lun *rlun;
struct nvm_chk_meta *chunk;
struct ppa_addr ppa;
u64 line_wp;
int pos, i;
int pos;
rlun = &pblk->luns[0];
rlun = &pblk->luns[index];
ppa = rlun->bppa;
pos = pblk_ppa_to_pos(geo, ppa);
chunk = &line->chks[pos];
line_wp = chunk->wp;
return &line->chks[pos];
}
for (i = 1; i < lm->blk_per_line; i++) {
rlun = &pblk->luns[i];
ppa = rlun->bppa;
pos = pblk_ppa_to_pos(geo, ppa);
chunk = &line->chks[pos];
static int pblk_line_wps_are_unbalanced(struct pblk *pblk,
struct pblk_line *line)
{
struct pblk_line_meta *lm = &pblk->lm;
int blk_in_line = lm->blk_per_line;
struct nvm_chk_meta *chunk;
u64 max_wp, min_wp;
int i;
if (chunk->wp > line_wp)
i = find_first_zero_bit(line->blk_bitmap, blk_in_line);
/* If there is one or zero good chunks in the line,
* the write pointers can't be unbalanced.
*/
if (i >= (blk_in_line - 1))
return 0;
chunk = pblk_get_stripe_chunk(pblk, line, i);
max_wp = chunk->wp;
if (max_wp > pblk->max_write_pgs)
min_wp = max_wp - pblk->max_write_pgs;
else
min_wp = 0;
i = find_next_zero_bit(line->blk_bitmap, blk_in_line, i + 1);
while (i < blk_in_line) {
chunk = pblk_get_stripe_chunk(pblk, line, i);
if (chunk->wp > max_wp || chunk->wp < min_wp)
return 1;
else if (chunk->wp < line_wp)
line_wp = chunk->wp;
i = find_next_zero_bit(line->blk_bitmap, blk_in_line, i + 1);
}
return 0;
@ -356,7 +376,7 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
int ret;
u64 left_ppas = pblk_sec_in_open_line(pblk, line) - lm->smeta_sec;
if (pblk_line_wp_is_unbalanced(pblk, line))
if (pblk_line_wps_are_unbalanced(pblk, line))
pblk_warn(pblk, "recovering unbalanced line (%d)\n", line->id);
ppa_list = p.ppa_list;
@ -703,11 +723,13 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
/* The first valid instance uuid is used for initialization */
if (!valid_uuid) {
memcpy(pblk->instance_uuid, smeta_buf->header.uuid, 16);
guid_copy(&pblk->instance_uuid,
(guid_t *)&smeta_buf->header.uuid);
valid_uuid = 1;
}
if (memcmp(pblk->instance_uuid, smeta_buf->header.uuid, 16)) {
if (!guid_equal(&pblk->instance_uuid,
(guid_t *)&smeta_buf->header.uuid)) {
pblk_debug(pblk, "ignore line %u due to uuid mismatch\n",
i);
continue;
@ -737,7 +759,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
}
if (!found_lines) {
pblk_setup_uuid(pblk);
guid_gen(&pblk->instance_uuid);
spin_lock(&l_mg->free_lock);
WARN_ON_ONCE(!test_and_clear_bit(meta_line,

View File

@ -207,7 +207,7 @@ void pblk_rl_free(struct pblk_rl *rl)
del_timer(&rl->u_timer);
}
void pblk_rl_init(struct pblk_rl *rl, int budget)
void pblk_rl_init(struct pblk_rl *rl, int budget, int threshold)
{
struct pblk *pblk = container_of(rl, struct pblk, rl);
struct nvm_tgt_dev *dev = pblk->dev;
@ -217,7 +217,6 @@ void pblk_rl_init(struct pblk_rl *rl, int budget)
int sec_meta, blk_meta;
unsigned int rb_windows;
/* Consider sectors used for metadata */
sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
blk_meta = DIV_ROUND_UP(sec_meta, geo->clba);
@ -234,7 +233,7 @@ void pblk_rl_init(struct pblk_rl *rl, int budget)
/* To start with, all buffer is available to user I/O writers */
rl->rb_budget = budget;
rl->rb_user_max = budget;
rl->rb_max_io = budget >> 1;
rl->rb_max_io = threshold ? (budget - threshold) : (budget - 1);
rl->rb_gc_max = 0;
rl->rb_state = PBLK_RL_HIGH;

View File

@ -139,7 +139,7 @@ TRACE_EVENT(pblk_state,
/* This part must be outside protection */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH ../../../drivers/lightnvm
#define TRACE_INCLUDE_PATH ../../drivers/lightnvm
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE pblk-trace
#include <trace/define_trace.h>

View File

@ -177,6 +177,7 @@ static void pblk_prepare_resubmit(struct pblk *pblk, unsigned int sentry,
* re-map these entries
*/
line = pblk_ppa_to_line(pblk, w_ctx->ppa);
atomic_dec(&line->sec_to_update);
kref_put(&line->ref, pblk_line_put);
}
spin_unlock(&pblk->trans_lock);

View File

@ -131,8 +131,8 @@ struct pblk_pr_ctx {
unsigned int bio_init_idx;
void *ppa_ptr;
dma_addr_t dma_ppa_list;
__le64 lba_list_mem[NVM_MAX_VLBA];
__le64 lba_list_media[NVM_MAX_VLBA];
u64 lba_list_mem[NVM_MAX_VLBA];
u64 lba_list_media[NVM_MAX_VLBA];
};
/* Pad context */
@ -487,6 +487,7 @@ struct pblk_line {
__le32 *vsc; /* Valid sector count in line */
struct kref ref; /* Write buffer L2P references */
atomic_t sec_to_update; /* Outstanding L2P updates to ppa */
struct pblk_w_err_gc *w_err_gc; /* Write error gc recovery metadata */
@ -646,7 +647,7 @@ struct pblk {
int sec_per_write;
unsigned char instance_uuid[16];
guid_t instance_uuid;
/* Persistent write amplification counters, 4kb sector I/Os */
atomic64_t user_wa; /* Sectors written by user */
@ -924,7 +925,7 @@ int pblk_gc_sysfs_force(struct pblk *pblk, int force);
/*
* pblk rate limiter
*/
void pblk_rl_init(struct pblk_rl *rl, int budget);
void pblk_rl_init(struct pblk_rl *rl, int budget, int threshold);
void pblk_rl_free(struct pblk_rl *rl);
void pblk_rl_update_rates(struct pblk_rl *rl);
int pblk_rl_high_thrs(struct pblk_rl *rl);
@ -1360,14 +1361,6 @@ static inline unsigned int pblk_get_secs(struct bio *bio)
return bio->bi_iter.bi_size / PBLK_EXPOSED_PAGE_SIZE;
}
static inline void pblk_setup_uuid(struct pblk *pblk)
{
uuid_le uuid;
uuid_le_gen(&uuid);
memcpy(pblk->instance_uuid, uuid.b, 16);
}
static inline char *pblk_disk_name(struct pblk *pblk)
{
struct gendisk *disk = pblk->disk;

View File

@ -432,8 +432,9 @@ static void do_btree_node_write(struct btree *b)
int j;
struct bio_vec *bv;
void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bv, b->bio, j)
bio_for_each_segment_all(bv, b->bio, j, iter_all)
memcpy(page_address(bv->bv_page),
base + j * PAGE_SIZE, PAGE_SIZE);

View File

@ -538,6 +538,7 @@ static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
{
struct btree *b = container_of(bk, struct btree, keys);
unsigned int i, stale;
char buf[80];
if (!KEY_PTRS(k) ||
bch_extent_invalid(bk, k))
@ -547,19 +548,19 @@ static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
if (!ptr_available(b->c, k, i))
return true;
if (!expensive_debug_checks(b->c) && KEY_DIRTY(k))
return false;
for (i = 0; i < KEY_PTRS(k); i++) {
stale = ptr_stale(b->c, k, i);
if (stale && KEY_DIRTY(k)) {
bch_extent_to_text(buf, sizeof(buf), k);
pr_info("stale dirty pointer, stale %u, key: %s",
stale, buf);
}
btree_bug_on(stale > BUCKET_GC_GEN_MAX, b,
"key too stale: %i, need_gc %u",
stale, b->c->need_gc);
btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
b, "stale dirty pointer");
if (stale)
return true;

View File

@ -392,10 +392,11 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
/*
* Flag for bypass if the IO is for read-ahead or background,
* unless the read-ahead request is for metadata (eg, for gfs2).
* unless the read-ahead request is for metadata
* (eg, for gfs2 or xfs).
*/
if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) &&
!(bio->bi_opf & REQ_PRIO))
!(bio->bi_opf & (REQ_META|REQ_PRIO)))
goto skip;
if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
@ -877,7 +878,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
}
if (!(bio->bi_opf & REQ_RAHEAD) &&
!(bio->bi_opf & REQ_PRIO) &&
!(bio->bi_opf & (REQ_META|REQ_PRIO)) &&
s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA)
reada = min_t(sector_t, dc->readahead >> 9,
get_capacity(bio->bi_disk) - bio_end_sector(bio));

View File

@ -111,7 +111,7 @@ void bch_cache_accounting_clear(struct cache_accounting *acc)
{
memset(&acc->total.cache_hits,
0,
sizeof(unsigned long) * 7);
sizeof(struct cache_stats));
}
void bch_cache_accounting_destroy(struct cache_accounting *acc)

View File

@ -1615,21 +1615,21 @@ static void conditional_stop_bcache_device(struct cache_set *c,
*/
pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.",
d->disk->disk_name);
/*
* There might be a small time gap that cache set is
* released but bcache device is not. Inside this time
* gap, regular I/O requests will directly go into
* backing device as no cache set attached to. This
* behavior may also introduce potential inconsistence
* data in writeback mode while cache is dirty.
* Therefore before calling bcache_device_stop() due
* to a broken cache device, dc->io_disable should be
* explicitly set to true.
*/
dc->io_disable = true;
/* make others know io_disable is true earlier */
smp_mb();
bcache_device_stop(d);
/*
* There might be a small time gap that cache set is
* released but bcache device is not. Inside this time
* gap, regular I/O requests will directly go into
* backing device as no cache set attached to. This
* behavior may also introduce potential inconsistence
* data in writeback mode while cache is dirty.
* Therefore before calling bcache_device_stop() due
* to a broken cache device, dc->io_disable should be
* explicitly set to true.
*/
dc->io_disable = true;
/* make others know io_disable is true earlier */
smp_mb();
bcache_device_stop(d);
} else {
/*
* dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO

View File

@ -67,6 +67,8 @@ read_attribute(written);
read_attribute(btree_written);
read_attribute(metadata_written);
read_attribute(active_journal_entries);
read_attribute(backing_dev_name);
read_attribute(backing_dev_uuid);
sysfs_time_stats_attribute(btree_gc, sec, ms);
sysfs_time_stats_attribute(btree_split, sec, us);
@ -243,6 +245,19 @@ SHOW(__bch_cached_dev)
return strlen(buf);
}
if (attr == &sysfs_backing_dev_name) {
snprintf(buf, BDEVNAME_SIZE + 1, "%s", dc->backing_dev_name);
strcat(buf, "\n");
return strlen(buf);
}
if (attr == &sysfs_backing_dev_uuid) {
/* convert binary uuid into 36-byte string plus '\0' */
snprintf(buf, 36+1, "%pU", dc->sb.uuid);
strcat(buf, "\n");
return strlen(buf);
}
#undef var
return 0;
}
@ -262,10 +277,10 @@ STORE(__cached_dev)
sysfs_strtoul(data_csum, dc->disk.data_csum);
d_strtoul(verify);
d_strtoul(bypass_torture_test);
d_strtoul(writeback_metadata);
d_strtoul(writeback_running);
d_strtoul(writeback_delay);
sysfs_strtoul_bool(bypass_torture_test, dc->bypass_torture_test);
sysfs_strtoul_bool(writeback_metadata, dc->writeback_metadata);
sysfs_strtoul_bool(writeback_running, dc->writeback_running);
sysfs_strtoul_clamp(writeback_delay, dc->writeback_delay, 0, UINT_MAX);
sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent,
0, bch_cutoff_writeback);
@ -287,9 +302,15 @@ STORE(__cached_dev)
sysfs_strtoul_clamp(writeback_rate_update_seconds,
dc->writeback_rate_update_seconds,
1, WRITEBACK_RATE_UPDATE_SECS_MAX);
d_strtoul(writeback_rate_i_term_inverse);
d_strtoul_nonzero(writeback_rate_p_term_inverse);
d_strtoul_nonzero(writeback_rate_minimum);
sysfs_strtoul_clamp(writeback_rate_i_term_inverse,
dc->writeback_rate_i_term_inverse,
1, UINT_MAX);
sysfs_strtoul_clamp(writeback_rate_p_term_inverse,
dc->writeback_rate_p_term_inverse,
1, UINT_MAX);
sysfs_strtoul_clamp(writeback_rate_minimum,
dc->writeback_rate_minimum,
1, UINT_MAX);
sysfs_strtoul_clamp(io_error_limit, dc->error_limit, 0, INT_MAX);
@ -299,7 +320,9 @@ STORE(__cached_dev)
dc->io_disable = v ? 1 : 0;
}
d_strtoi_h(sequential_cutoff);
sysfs_strtoul_clamp(sequential_cutoff,
dc->sequential_cutoff,
0, UINT_MAX);
d_strtoi_h(readahead);
if (attr == &sysfs_clear_stats)
@ -452,6 +475,8 @@ static struct attribute *bch_cached_dev_files[] = {
&sysfs_verify,
&sysfs_bypass_torture_test,
#endif
&sysfs_backing_dev_name,
&sysfs_backing_dev_uuid,
NULL
};
KTYPE(bch_cached_dev);
@ -761,10 +786,12 @@ STORE(__bch_cache_set)
c->shrink.scan_objects(&c->shrink, &sc);
}
sysfs_strtoul(congested_read_threshold_us,
c->congested_read_threshold_us);
sysfs_strtoul(congested_write_threshold_us,
c->congested_write_threshold_us);
sysfs_strtoul_clamp(congested_read_threshold_us,
c->congested_read_threshold_us,
0, UINT_MAX);
sysfs_strtoul_clamp(congested_write_threshold_us,
c->congested_write_threshold_us,
0, UINT_MAX);
if (attr == &sysfs_errors) {
v = __sysfs_match_string(error_actions, -1, buf);
@ -774,12 +801,20 @@ STORE(__bch_cache_set)
c->on_error = v;
}
if (attr == &sysfs_io_error_limit)
c->error_limit = strtoul_or_return(buf);
sysfs_strtoul_clamp(io_error_limit, c->error_limit, 0, UINT_MAX);
/* See count_io_errors() for why 88 */
if (attr == &sysfs_io_error_halflife)
c->error_decay = strtoul_or_return(buf) / 88;
if (attr == &sysfs_io_error_halflife) {
unsigned long v = 0;
ssize_t ret;
ret = strtoul_safe_clamp(buf, v, 0, UINT_MAX);
if (!ret) {
c->error_decay = v / 88;
return size;
}
return ret;
}
if (attr == &sysfs_io_disable) {
v = strtoul_or_return(buf);
@ -794,13 +829,15 @@ STORE(__bch_cache_set)
}
}
sysfs_strtoul(journal_delay_ms, c->journal_delay_ms);
sysfs_strtoul(verify, c->verify);
sysfs_strtoul(key_merging_disabled, c->key_merging_disabled);
sysfs_strtoul_clamp(journal_delay_ms,
c->journal_delay_ms,
0, USHRT_MAX);
sysfs_strtoul_bool(verify, c->verify);
sysfs_strtoul_bool(key_merging_disabled, c->key_merging_disabled);
sysfs_strtoul(expensive_debug_checks, c->expensive_debug_checks);
sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite);
sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled);
sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled);
sysfs_strtoul_bool(gc_always_rewrite, c->gc_always_rewrite);
sysfs_strtoul_bool(btree_shrinker_disabled, c->shrinker_disabled);
sysfs_strtoul_bool(copy_gc_enabled, c->copy_gc_enabled);
/*
* write gc_after_writeback here may overwrite an already set
* BCH_DO_AUTO_GC, it doesn't matter because this flag will be

View File

@ -79,11 +79,28 @@ do { \
return strtoul_safe(buf, var) ?: (ssize_t) size; \
} while (0)
#define sysfs_strtoul_bool(file, var) \
do { \
if (attr == &sysfs_ ## file) { \
unsigned long v = strtoul_or_return(buf); \
\
var = v ? 1 : 0; \
return size; \
} \
} while (0)
#define sysfs_strtoul_clamp(file, var, min, max) \
do { \
if (attr == &sysfs_ ## file) \
return strtoul_safe_clamp(buf, var, min, max) \
?: (ssize_t) size; \
if (attr == &sysfs_ ## file) { \
unsigned long v = 0; \
ssize_t ret; \
ret = strtoul_safe_clamp(buf, v, min, max); \
if (!ret) { \
var = v; \
return size; \
} \
return ret; \
} \
} while (0)
#define strtoul_or_return(cp) \

View File

@ -270,7 +270,11 @@ int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
int i;
struct bio_vec *bv;
bio_for_each_segment_all(bv, bio, i) {
/*
* This is called on freshly new bio, so it is safe to access the
* bvec table directly.
*/
for (i = 0, bv = bio->bi_io_vec; i < bio->bi_vcnt; bv++, i++) {
bv->bv_page = alloc_page(gfp_mask);
if (!bv->bv_page) {
while (--bv >= bio->bi_io_vec)

View File

@ -71,6 +71,9 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
in_use > bch_cutoff_writeback_sync)
return false;
if (bio_op(bio) == REQ_OP_DISCARD)
return false;
if (dc->partial_stripes_expensive &&
bcache_dev_stripe_dirty(dc, bio->bi_iter.bi_sector,
bio_sectors(bio)))

View File

@ -1447,8 +1447,9 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
{
unsigned int i;
struct bio_vec *bv;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bv, clone, i) {
bio_for_each_segment_all(bv, clone, i, iter_all) {
BUG_ON(!bv->bv_page);
mempool_free(bv->bv_page, &cc->page_pool);
}

View File

@ -527,7 +527,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
md->tag_set->ops = &dm_mq_ops;
md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
md->tag_set->numa_node = md->numa_node_id;
md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
md->tag_set->driver_data = md;

View File

@ -1698,14 +1698,6 @@ static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
return q && !blk_queue_add_random(q);
}
static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
struct request_queue *q = bdev_get_queue(dev->bdev);
return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
}
static bool dm_table_all_devices_attribute(struct dm_table *t,
iterate_devices_callout_fn func)
{
@ -1902,11 +1894,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
if (!dm_table_supports_write_zeroes(t))
q->limits.max_write_zeroes_sectors = 0;
if (dm_table_all_devices_attribute(t, queue_supports_sg_merge))
blk_queue_flag_clear(QUEUE_FLAG_NO_SG_MERGE, q);
else
blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q);
dm_table_verify_integrity(t);
/*

View File

@ -96,8 +96,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
int i, cnt;
bool discard_supported = false;
conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info),
GFP_KERNEL);
conf = kzalloc(struct_size(conf, disks, raid_disks), GFP_KERNEL);
if (!conf)
return NULL;

View File

@ -1603,11 +1603,9 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
return;
}
set_bit(Blocked, &rdev->flags);
if (test_and_clear_bit(In_sync, &rdev->flags)) {
if (test_and_clear_bit(In_sync, &rdev->flags))
mddev->degraded++;
set_bit(Faulty, &rdev->flags);
} else
set_bit(Faulty, &rdev->flags);
set_bit(Faulty, &rdev->flags);
spin_unlock_irqrestore(&conf->device_lock, flags);
/*
* if recovery is running, make sure it aborts.
@ -2120,13 +2118,14 @@ static void process_checks(struct r1bio *r1_bio)
struct page **spages = get_resync_pages(sbio)->pages;
struct bio_vec *bi;
int page_len[RESYNC_PAGES] = { 0 };
struct bvec_iter_all iter_all;
if (sbio->bi_end_io != end_sync_read)
continue;
/* Now we can 'fixup' the error value */
sbio->bi_status = 0;
bio_for_each_segment_all(bi, sbio, j)
bio_for_each_segment_all(bi, sbio, j, iter_all)
page_len[j] = bi->bv_len;
if (!status) {

View File

@ -417,8 +417,7 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card)
else
mq->tag_set.queue_depth = MMC_QUEUE_DEPTH;
mq->tag_set.numa_node = NUMA_NO_NODE;
mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE |
BLK_MQ_F_BLOCKING;
mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
mq->tag_set.nr_hw_queues = 1;
mq->tag_set.cmd_size = sizeof(struct mmc_queue_req);
mq->tag_set.driver_data = mq;

View File

@ -1,15 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* NVM Express device driver
* Copyright (c) 2011-2014, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#include <linux/blkdev.h>
@ -151,11 +143,8 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync);
static void nvme_delete_ctrl_work(struct work_struct *work)
static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
{
struct nvme_ctrl *ctrl =
container_of(work, struct nvme_ctrl, delete_work);
dev_info(ctrl->device,
"Removing ctrl: NQN \"%s\"\n", ctrl->opts->subsysnqn);
@ -167,6 +156,14 @@ static void nvme_delete_ctrl_work(struct work_struct *work)
nvme_put_ctrl(ctrl);
}
static void nvme_delete_ctrl_work(struct work_struct *work)
{
struct nvme_ctrl *ctrl =
container_of(work, struct nvme_ctrl, delete_work);
nvme_do_delete_ctrl(ctrl);
}
int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
{
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
@ -177,7 +174,7 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
static int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
{
int ret = 0;
@ -186,13 +183,13 @@ int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
* can free the controller.
*/
nvme_get_ctrl(ctrl);
ret = nvme_delete_ctrl(ctrl);
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
ret = -EBUSY;
if (!ret)
flush_work(&ctrl->delete_work);
nvme_do_delete_ctrl(ctrl);
nvme_put_ctrl(ctrl);
return ret;
}
EXPORT_SYMBOL_GPL(nvme_delete_ctrl_sync);
static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
{
@ -611,6 +608,22 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
return BLK_STS_OK;
}
static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
struct request *req, struct nvme_command *cmnd)
{
if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
return nvme_setup_discard(ns, req, cmnd);
cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
cmnd->write_zeroes.slba =
cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
cmnd->write_zeroes.length =
cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
cmnd->write_zeroes.control = 0;
return BLK_STS_OK;
}
static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
struct request *req, struct nvme_command *cmnd)
{
@ -705,7 +718,8 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
nvme_setup_flush(ns, cmd);
break;
case REQ_OP_WRITE_ZEROES:
/* currently only aliased to deallocate for a few ctrls: */
ret = nvme_setup_write_zeroes(ns, req, cmd);
break;
case REQ_OP_DISCARD:
ret = nvme_setup_discard(ns, req, cmd);
break;
@ -1512,6 +1526,37 @@ static void nvme_config_discard(struct nvme_ns *ns)
blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
}
static inline void nvme_config_write_zeroes(struct nvme_ns *ns)
{
u32 max_sectors;
unsigned short bs = 1 << ns->lba_shift;
if (!(ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES))
return;
/*
* Even though NVMe spec explicitly states that MDTS is not
* applicable to the write-zeroes:- "The restriction does not apply to
* commands that do not transfer data between the host and the
* controller (e.g., Write Uncorrectable ro Write Zeroes command).".
* In order to be more cautious use controller's max_hw_sectors value
* to configure the maximum sectors for the write-zeroes which is
* configured based on the controller's MDTS field in the
* nvme_init_identify() if available.
*/
if (ns->ctrl->max_hw_sectors == UINT_MAX)
max_sectors = ((u32)(USHRT_MAX + 1) * bs) >> 9;
else
max_sectors = ((u32)(ns->ctrl->max_hw_sectors + 1) * bs) >> 9;
blk_queue_max_write_zeroes_sectors(ns->queue, max_sectors);
}
static inline void nvme_ns_config_oncs(struct nvme_ns *ns)
{
nvme_config_discard(ns);
nvme_config_write_zeroes(ns);
}
static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
struct nvme_id_ns *id, struct nvme_ns_ids *ids)
{
@ -1565,7 +1610,7 @@ static void nvme_update_disk_info(struct gendisk *disk,
capacity = 0;
set_capacity(disk, capacity);
nvme_config_discard(ns);
nvme_ns_config_oncs(ns);
if (id->nsattr & (1 << 0))
set_disk_ro(disk, true);
@ -2280,6 +2325,9 @@ static struct attribute *nvme_subsys_attrs[] = {
&subsys_attr_serial.attr,
&subsys_attr_firmware_rev.attr,
&subsys_attr_subsysnqn.attr,
#ifdef CONFIG_NVME_MULTIPATH
&subsys_attr_iopolicy.attr,
#endif
NULL,
};
@ -2332,6 +2380,9 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
subsys->vendor_id = le16_to_cpu(id->vid);
subsys->cmic = id->cmic;
#ifdef CONFIG_NVME_MULTIPATH
subsys->iopolicy = NVME_IOPOLICY_NUMA;
#endif
subsys->dev.class = nvme_subsys_class;
subsys->dev.release = nvme_release_subsystem;
@ -3163,21 +3214,23 @@ static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
return 0;
}
static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
struct nvme_ns *ns;
struct gendisk *disk;
struct nvme_id_ns *id;
char disk_name[DISK_NAME_LEN];
int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT;
int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT, ret;
ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
if (!ns)
return;
return -ENOMEM;
ns->queue = blk_mq_init_queue(ctrl->tagset);
if (IS_ERR(ns->queue))
if (IS_ERR(ns->queue)) {
ret = PTR_ERR(ns->queue);
goto out_free_ns;
}
blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
@ -3193,20 +3246,27 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
nvme_set_queue_limits(ctrl, ns->queue);
id = nvme_identify_ns(ctrl, nsid);
if (!id)
if (!id) {
ret = -EIO;
goto out_free_queue;
}
if (id->ncap == 0)
if (id->ncap == 0) {
ret = -EINVAL;
goto out_free_id;
}
if (nvme_init_ns_head(ns, nsid, id))
ret = nvme_init_ns_head(ns, nsid, id);
if (ret)
goto out_free_id;
nvme_setup_streams_ns(ctrl, ns);
nvme_set_disk_name(disk_name, ns, ctrl, &flags);
disk = alloc_disk_node(0, node);
if (!disk)
if (!disk) {
ret = -ENOMEM;
goto out_unlink_ns;
}
disk->fops = &nvme_fops;
disk->private_data = ns;
@ -3218,7 +3278,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
__nvme_revalidate_disk(disk, id);
if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
if (nvme_nvm_register(ns, disk_name, node)) {
ret = nvme_nvm_register(ns, disk_name, node);
if (ret) {
dev_warn(ctrl->device, "LightNVM init failure\n");
goto out_put_disk;
}
@ -3236,7 +3297,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
nvme_fault_inject_init(ns);
kfree(id);
return;
return 0;
out_put_disk:
put_disk(ns->disk);
out_unlink_ns:
@ -3249,6 +3310,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
blk_cleanup_queue(ns->queue);
out_free_ns:
kfree(ns);
return ret;
}
static void nvme_ns_remove(struct nvme_ns *ns)
@ -3596,8 +3658,6 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
nvme_stop_keep_alive(ctrl);
flush_work(&ctrl->async_event_work);
cancel_work_sync(&ctrl->fw_act_work);
if (ctrl->ops->stop_ctrl)
ctrl->ops->stop_ctrl(ctrl);
}
EXPORT_SYMBOL_GPL(nvme_stop_ctrl);

View File

@ -1,15 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* NVMe over Fabrics common host code.
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/init.h>
@ -430,6 +422,7 @@ EXPORT_SYMBOL_GPL(nvmf_connect_admin_queue);
* @qid: NVMe I/O queue number for the new I/O connection between
* host and target (note qid == 0 is illegal as this is
* the Admin queue, per NVMe standard).
* @poll: Whether or not to poll for the completion of the connect cmd.
*
* This function issues a fabrics-protocol connection
* of a NVMe I/O queue (via NVMe Fabrics "Connect" command)

View File

@ -1,15 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* NVMe over Fabrics common host code.
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#ifndef _NVME_FABRICS_H
#define _NVME_FABRICS_H 1

View File

@ -1,8 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
/*
* fault injection support for nvme.
*
* Copyright (c) 2018, Oracle and/or its affiliates
*
*/
#include <linux/moduleparam.h>

View File

@ -1,18 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2016 Avago Technologies. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful.
* ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES,
* INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A
* PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO
* THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID.
* See the GNU General Public License for more details, a copy of which
* can be found in the file COPYING included with this package
*
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>

View File

@ -1,23 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
/*
* nvme-lightnvm.c - LightNVM NVMe device
*
* Copyright (C) 2014-2015 IT University of Copenhagen
* Initial release: Matias Bjorling <mb@lightnvm.io>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
* USA.
*
*/
#include "nvme.h"

View File

@ -1,14 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2017-2018 Christoph Hellwig.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#include <linux/moduleparam.h>
@ -141,7 +133,10 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
test_bit(NVME_NS_ANA_PENDING, &ns->flags))
continue;
distance = node_distance(node, ns->ctrl->numa_node);
if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
distance = node_distance(node, ns->ctrl->numa_node);
else
distance = LOCAL_DISTANCE;
switch (ns->ana_state) {
case NVME_ANA_OPTIMIZED:
@ -168,6 +163,47 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
return found;
}
static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
struct nvme_ns *ns)
{
ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
siblings);
if (ns)
return ns;
return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
}
static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
int node, struct nvme_ns *old)
{
struct nvme_ns *ns, *found, *fallback = NULL;
if (list_is_singular(&head->list))
return old;
for (ns = nvme_next_ns(head, old);
ns != old;
ns = nvme_next_ns(head, ns)) {
if (ns->ctrl->state != NVME_CTRL_LIVE ||
test_bit(NVME_NS_ANA_PENDING, &ns->flags))
continue;
if (ns->ana_state == NVME_ANA_OPTIMIZED) {
found = ns;
goto out;
}
if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
fallback = ns;
}
if (!fallback)
return NULL;
found = fallback;
out:
rcu_assign_pointer(head->current_path[node], found);
return found;
}
static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
return ns->ctrl->state == NVME_CTRL_LIVE &&
@ -180,6 +216,8 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
struct nvme_ns *ns;
ns = srcu_dereference(head->current_path[node], &head->srcu);
if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR && ns)
ns = nvme_round_robin_path(head, node, ns);
if (unlikely(!ns || !nvme_path_is_optimized(ns)))
ns = __nvme_find_path(head, node);
return ns;
@ -471,6 +509,44 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl)
cancel_work_sync(&ctrl->ana_work);
}
#define SUBSYS_ATTR_RW(_name, _mode, _show, _store) \
struct device_attribute subsys_attr_##_name = \
__ATTR(_name, _mode, _show, _store)
static const char *nvme_iopolicy_names[] = {
[NVME_IOPOLICY_NUMA] = "numa",
[NVME_IOPOLICY_RR] = "round-robin",
};
static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct nvme_subsystem *subsys =
container_of(dev, struct nvme_subsystem, dev);
return sprintf(buf, "%s\n",
nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
}
static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
struct nvme_subsystem *subsys =
container_of(dev, struct nvme_subsystem, dev);
int i;
for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
WRITE_ONCE(subsys->iopolicy, i);
return count;
}
}
return -EINVAL;
}
SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
char *buf)
{

View File

@ -1,14 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2011-2014, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#ifndef _NVME_H
@ -252,6 +244,11 @@ struct nvme_ctrl {
unsigned long discard_page_busy;
};
enum nvme_iopolicy {
NVME_IOPOLICY_NUMA,
NVME_IOPOLICY_RR,
};
struct nvme_subsystem {
int instance;
struct device dev;
@ -271,6 +268,9 @@ struct nvme_subsystem {
u8 cmic;
u16 vendor_id;
struct ida ns_ida;
#ifdef CONFIG_NVME_MULTIPATH
enum nvme_iopolicy iopolicy;
#endif
};
/*
@ -364,7 +364,6 @@ struct nvme_ctrl_ops {
void (*submit_async_event)(struct nvme_ctrl *ctrl);
void (*delete_ctrl)(struct nvme_ctrl *ctrl);
int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
void (*stop_ctrl)(struct nvme_ctrl *ctrl);
};
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
@ -459,7 +458,6 @@ void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl);
int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
void *log, size_t size, u64 offset);
@ -492,6 +490,7 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
extern struct device_attribute dev_attr_ana_grpid;
extern struct device_attribute dev_attr_ana_state;
extern struct device_attribute subsys_attr_iopolicy;
#else
static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)

View File

@ -1,15 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* NVM Express device driver
* Copyright (c) 2011-2014, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#include <linux/aer.h>
@ -157,6 +149,8 @@ static int queue_count_set(const char *val, const struct kernel_param *kp)
int n = 0, ret;
ret = kstrtoint(val, 10, &n);
if (ret)
return ret;
if (n > num_possible_cpus())
n = num_possible_cpus();

View File

@ -1,15 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* NVMe over Fabrics RDMA host code.
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@ -942,14 +934,6 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
}
}
static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
cancel_work_sync(&ctrl->err_work);
cancel_delayed_work_sync(&ctrl->reconnect_work);
}
static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
@ -1158,7 +1142,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
struct nvme_rdma_device *dev = queue->device;
struct ib_device *ibdev = dev->dev;
if (!blk_rq_payload_bytes(rq))
if (!blk_rq_nr_phys_segments(rq))
return;
if (req->mr) {
@ -1281,7 +1265,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
c->common.flags |= NVME_CMD_SGL_METABUF;
if (!blk_rq_payload_bytes(rq))
if (!blk_rq_nr_phys_segments(rq))
return nvme_rdma_set_sg_null(c);
req->sg_table.sgl = req->first_sgl;
@ -1854,6 +1838,9 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
{
cancel_work_sync(&ctrl->err_work);
cancel_delayed_work_sync(&ctrl->reconnect_work);
nvme_rdma_teardown_io_queues(ctrl, shutdown);
if (shutdown)
nvme_shutdown_ctrl(&ctrl->ctrl);
@ -1902,7 +1889,6 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
.submit_async_event = nvme_rdma_submit_async_event,
.delete_ctrl = nvme_rdma_delete_ctrl,
.get_address = nvmf_get_address,
.stop_ctrl = nvme_rdma_stop_ctrl,
};
/*

View File

@ -1822,6 +1822,9 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
{
cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
nvme_tcp_teardown_io_queues(ctrl, shutdown);
if (shutdown)
nvme_shutdown_ctrl(ctrl);
@ -1859,12 +1862,6 @@ out_fail:
nvme_tcp_reconnect_or_remove(ctrl);
}
static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
{
cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
}
static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
@ -2115,7 +2112,6 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
.submit_async_event = nvme_tcp_submit_async_event,
.delete_ctrl = nvme_tcp_delete_ctrl,
.get_address = nvmf_get_address,
.stop_ctrl = nvme_tcp_stop_ctrl,
};
static bool

View File

@ -1,15 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* NVM Express device driver tracepoints
* Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#include <asm/unaligned.h>

View File

@ -1,15 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* NVM Express device driver tracepoints
* Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#undef TRACE_SYSTEM

View File

@ -1,15 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* NVMe admin command implementation.
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>

View File

@ -1,15 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Configfs interface for the NVMe target.
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>

View File

@ -1,15 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Common code for the NVMe target.
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>

View File

@ -1,15 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Discovery service for the NVMe over Fabrics target.
* Copyright (C) 2016 Intel Corporation. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/slab.h>
@ -331,7 +323,7 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
cmd->get_log_page.lid);
req->error_loc =
offsetof(struct nvme_get_log_page_command, lid);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
case nvme_admin_identify:
req->data_len = NVME_IDENTIFY_DATA_SIZE;

View File

@ -1,15 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* NVMe Fabrics command implementation.
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/blkdev.h>

View File

@ -1,18 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2016 Avago Technologies. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful.
* ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES,
* INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A
* PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO
* THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID.
* See the GNU General Public License for more details, a copy of which
* can be found in the file COPYING included with this package
*
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>

View File

@ -1,17 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2016 Avago Technologies. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful.
* ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES,
* INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A
* PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO
* THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID.
* See the GNU General Public License for more details, a copy of which
* can be found in the file COPYING included with this package
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>

View File

@ -1,15 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* NVMe I/O command implementation.
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/blkdev.h>

View File

@ -1,15 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* NVMe over Fabrics loopback device.
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/scatterlist.h>

View File

@ -1,14 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#ifndef _NVMET_H

View File

@ -1,15 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* NVMe over Fabrics RDMA target.
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/atomic.h>

View File

@ -1900,7 +1900,7 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost)
shost->tag_set.queue_depth = shost->can_queue;
shost->tag_set.cmd_size = cmd_size;
shost->tag_set.numa_node = NUMA_NO_NODE;
shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
shost->tag_set.flags |=
BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy);
shost->tag_set.driver_data = shost;

View File

@ -20,8 +20,9 @@ static inline void read_endio(struct bio *bio)
int i;
struct bio_vec *bvec;
const blk_status_t err = bio->bi_status;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, i) {
bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
/* page is already locked */

View File

@ -849,8 +849,9 @@ static inline void z_erofs_vle_read_endio(struct bio *bio)
#ifdef EROFS_FS_HAS_MANAGED_CACHE
struct address_space *mc = NULL;
#endif
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, i) {
bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
bool cachemngd = false;

View File

@ -211,6 +211,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
ssize_t ret;
blk_qc_t qc;
int i;
struct bvec_iter_all iter_all;
if ((pos | iov_iter_alignment(iter)) &
(bdev_logical_block_size(bdev) - 1))
@ -247,7 +248,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
task_io_account_write(ret);
}
if (iocb->ki_flags & IOCB_HIPRI)
bio.bi_opf |= REQ_HIPRI;
bio_set_polled(&bio, iocb);
qc = submit_bio(&bio);
for (;;) {
@ -260,7 +261,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
}
__set_current_state(TASK_RUNNING);
bio_for_each_segment_all(bvec, &bio, i) {
bio_for_each_segment_all(bvec, &bio, i, iter_all) {
if (should_dirty && !PageCompound(bvec->bv_page))
set_page_dirty_lock(bvec->bv_page);
put_page(bvec->bv_page);
@ -293,6 +294,14 @@ struct blkdev_dio {
static struct bio_set blkdev_dio_pool;
static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
{
struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
struct request_queue *q = bdev_get_queue(bdev);
return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
}
static void blkdev_bio_end_io(struct bio *bio)
{
struct blkdev_dio *dio = bio->bi_private;
@ -329,8 +338,9 @@ static void blkdev_bio_end_io(struct bio *bio)
} else {
struct bio_vec *bvec;
int i;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, i)
bio_for_each_segment_all(bvec, bio, i, iter_all)
put_page(bvec->bv_page);
bio_put(bio);
}
@ -406,10 +416,17 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
if (!nr_pages) {
if (iocb->ki_flags & IOCB_HIPRI)
bio->bi_opf |= REQ_HIPRI;
bool polled = false;
if (iocb->ki_flags & IOCB_HIPRI) {
bio_set_polled(bio, iocb);
polled = true;
}
qc = submit_bio(bio);
if (polled)
WRITE_ONCE(iocb->ki_cookie, qc);
break;
}
@ -2076,6 +2093,7 @@ const struct file_operations def_blk_fops = {
.llseek = block_llseek,
.read_iter = blkdev_read_iter,
.write_iter = blkdev_write_iter,
.iopoll = blkdev_iopoll,
.mmap = generic_file_mmap,
.fsync = blkdev_fsync,
.unlocked_ioctl = block_ioctl,

View File

@ -162,13 +162,14 @@ csum_failed:
} else {
int i;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
/*
* we have verified the checksum already, set page
* checked so the end_io handlers know about it
*/
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, cb->orig_bio, i)
bio_for_each_segment_all(bvec, cb->orig_bio, i, iter_all)
SetPageChecked(bvec->bv_page);
bio_endio(cb->orig_bio);

View File

@ -833,9 +833,10 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
struct bio_vec *bvec;
struct btrfs_root *root;
int i, ret = 0;
struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, i) {
bio_for_each_segment_all(bvec, bio, i, iter_all) {
root = BTRFS_I(bvec->bv_page->mapping->host)->root;
ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
if (ret)

View File

@ -152,11 +152,12 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
{
blk_status_t ret = 0;
struct bio_vec *bvec = bio_last_bvec_all(bio);
struct page *page = bvec->bv_page;
struct bio_vec bv;
struct extent_io_tree *tree = bio->bi_private;
u64 start;
start = page_offset(page) + bvec->bv_offset;
mp_bvec_last_segment(bvec, &bv);
start = page_offset(bv.bv_page) + bv.bv_offset;
bio->bi_private = NULL;
@ -2379,7 +2380,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
int read_mode = 0;
blk_status_t status;
int ret;
unsigned failed_bio_pages = bio_pages_all(failed_bio);
unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT;
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
@ -2451,9 +2452,10 @@ static void end_bio_extent_writepage(struct bio *bio)
u64 start;
u64 end;
int i;
struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, i) {
bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@ -2522,9 +2524,10 @@ static void end_bio_extent_readpage(struct bio *bio)
int mirror;
int ret;
int i;
struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, i) {
bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@ -3641,9 +3644,10 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
struct bio_vec *bvec;
struct extent_buffer *eb;
int i, done;
struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, i) {
bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
eb = (struct extent_buffer *)page->private;

View File

@ -7829,6 +7829,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
struct bio_vec *bvec;
struct extent_io_tree *io_tree, *failure_tree;
int i;
struct bvec_iter_all iter_all;
if (bio->bi_status)
goto end;
@ -7840,7 +7841,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
done->uptodate = 1;
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, i)
bio_for_each_segment_all(bvec, bio, i, iter_all)
clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
io_tree, done->start, bvec->bv_page,
btrfs_ino(BTRFS_I(inode)), 0);
@ -7919,6 +7920,7 @@ static void btrfs_retry_endio(struct bio *bio)
int uptodate;
int ret;
int i;
struct bvec_iter_all iter_all;
if (bio->bi_status)
goto end;
@ -7932,7 +7934,7 @@ static void btrfs_retry_endio(struct bio *bio)
failure_tree = &BTRFS_I(inode)->io_failure_tree;
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, i) {
bio_for_each_segment_all(bvec, bio, i, iter_all) {
ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
bvec->bv_offset, done->start,
bvec->bv_len);

View File

@ -1443,10 +1443,11 @@ static void set_bio_pages_uptodate(struct bio *bio)
{
struct bio_vec *bvec;
int i;
struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, i)
bio_for_each_segment_all(bvec, bio, i, iter_all)
SetPageUptodate(bvec->bv_page);
}

View File

@ -3027,13 +3027,23 @@ void guard_bio_eod(int op, struct bio *bio)
/* Uhhuh. We've got a bio that straddles the device size! */
truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
/*
* The bio contains more than one segment which spans EOD, just return
* and let IO layer turn it into an EIO
*/
if (truncated_bytes > bvec->bv_len)
return;
/* Truncate the bio.. */
bio->bi_iter.bi_size -= truncated_bytes;
bvec->bv_len -= truncated_bytes;
/* ..and clear the end of the buffer for reads */
if (op == REQ_OP_READ) {
zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
struct bio_vec bv;
mp_bvec_last_segment(bvec, &bv);
zero_user(bv.bv_page, bv.bv_offset + bv.bv_len,
truncated_bytes);
}
}

View File

@ -30,8 +30,9 @@ static void __fscrypt_decrypt_bio(struct bio *bio, bool done)
{
struct bio_vec *bv;
int i;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bv, bio, i) {
bio_for_each_segment_all(bv, bio, i, iter_all) {
struct page *page = bv->bv_page;
int ret = fscrypt_decrypt_page(page->mapping->host, page,
PAGE_SIZE, 0, page->index);

View File

@ -551,7 +551,9 @@ static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio)
if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) {
bio_check_pages_dirty(bio); /* transfers ownership */
} else {
bio_for_each_segment_all(bvec, bio, i) {
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
if (dio->op == REQ_OP_READ && !PageCompound(page) &&

View File

@ -420,8 +420,9 @@ static void _clear_bio(struct bio *bio)
{
struct bio_vec *bv;
unsigned i;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bv, bio, i) {
bio_for_each_segment_all(bv, bio, i, iter_all) {
unsigned this_count = bv->bv_len;
if (likely(PAGE_SIZE == this_count))

View File

@ -468,11 +468,12 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
/* loop on all devices all pages */
for (d = 0; d < ios->numdevs; d++) {
struct bio *bio = ios->per_dev[d].bio;
struct bvec_iter_all iter_all;
if (!bio)
continue;
bio_for_each_segment_all(bv, bio, i) {
bio_for_each_segment_all(bv, bio, i, iter_all) {
struct page *page = bv->bv_page;
SetPageUptodate(page);

View File

@ -63,8 +63,9 @@ static void ext4_finish_bio(struct bio *bio)
{
int i;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, i) {
bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
#ifdef CONFIG_EXT4_FS_ENCRYPTION
struct page *data_page = NULL;

View File

@ -72,6 +72,7 @@ static void mpage_end_io(struct bio *bio)
{
struct bio_vec *bv;
int i;
struct bvec_iter_all iter_all;
if (ext4_bio_encrypted(bio)) {
if (bio->bi_status) {
@ -81,7 +82,7 @@ static void mpage_end_io(struct bio *bio)
return;
}
}
bio_for_each_segment_all(bv, bio, i) {
bio_for_each_segment_all(bv, bio, i, iter_all) {
struct page *page = bv->bv_page;
if (!bio->bi_status) {

View File

@ -87,8 +87,9 @@ static void __read_end_io(struct bio *bio)
struct page *page;
struct bio_vec *bv;
int i;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bv, bio, i) {
bio_for_each_segment_all(bv, bio, i, iter_all) {
page = bv->bv_page;
/* PG_error was set if any post_read step failed */
@ -164,13 +165,14 @@ static void f2fs_write_end_io(struct bio *bio)
struct f2fs_sb_info *sbi = bio->bi_private;
struct bio_vec *bvec;
int i;
struct bvec_iter_all iter_all;
if (time_to_inject(sbi, FAULT_WRITE_IO)) {
f2fs_show_injection_info(FAULT_WRITE_IO);
bio->bi_status = BLK_STS_IOERR;
}
bio_for_each_segment_all(bvec, bio, i) {
bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
enum count_type type = WB_DATA_TYPE(page);
@ -347,6 +349,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
struct bio_vec *bvec;
struct page *target;
int i;
struct bvec_iter_all iter_all;
if (!io->bio)
return false;
@ -354,7 +357,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
if (!inode && !page && !ino)
return true;
bio_for_each_segment_all(bvec, io->bio, i) {
bio_for_each_segment_all(bvec, io->bio, i, iter_all) {
if (bvec->bv_page->mapping)
target = bvec->bv_page;

View File

@ -1280,6 +1280,7 @@ const struct file_operations gfs2_file_fops = {
.llseek = gfs2_llseek,
.read_iter = gfs2_file_read_iter,
.write_iter = gfs2_file_write_iter,
.iopoll = iomap_dio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
.mmap = gfs2_mmap,
.open = gfs2_open,
@ -1310,6 +1311,7 @@ const struct file_operations gfs2_file_fops_nolock = {
.llseek = gfs2_llseek,
.read_iter = gfs2_file_read_iter,
.write_iter = gfs2_file_write_iter,
.iopoll = iomap_dio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
.mmap = gfs2_mmap,
.open = gfs2_open,

View File

@ -168,7 +168,8 @@ u64 gfs2_log_bmap(struct gfs2_sbd *sdp)
* that is pinned in the pagecache.
*/
static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec,
static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp,
struct bio_vec *bvec,
blk_status_t error)
{
struct buffer_head *bh, *next;
@ -207,6 +208,7 @@ static void gfs2_end_log_write(struct bio *bio)
struct bio_vec *bvec;
struct page *page;
int i;
struct bvec_iter_all iter_all;
if (bio->bi_status) {
fs_err(sdp, "Error %d writing to journal, jid=%u\n",
@ -214,7 +216,7 @@ static void gfs2_end_log_write(struct bio *bio)
wake_up(&sdp->sd_logd_waitq);
}
bio_for_each_segment_all(bvec, bio, i) {
bio_for_each_segment_all(bvec, bio, i, iter_all) {
page = bvec->bv_page;
if (page_has_buffers(page))
gfs2_end_log_write_bh(sdp, bvec, bio->bi_status);

View File

@ -190,8 +190,9 @@ static void gfs2_meta_read_endio(struct bio *bio)
{
struct bio_vec *bvec;
int i;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, i) {
bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
struct buffer_head *bh = page_buffers(page);
unsigned int len = bvec->bv_len;

View File

@ -274,8 +274,9 @@ iomap_read_end_io(struct bio *bio)
int error = blk_status_to_errno(bio->bi_status);
struct bio_vec *bvec;
int i;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, i)
bio_for_each_segment_all(bvec, bio, i, iter_all)
iomap_read_page_end_io(bvec, error);
bio_put(bio);
}
@ -324,7 +325,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
*/
sector = iomap_sector(iomap, pos);
if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
if (__bio_try_merge_page(ctx->bio, page, plen, poff))
if (__bio_try_merge_page(ctx->bio, page, plen, poff, true))
goto done;
is_contig = true;
}
@ -355,7 +356,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
ctx->bio->bi_end_io = iomap_read_end_io;
}
__bio_add_page(ctx->bio, page, plen, poff);
bio_add_page(ctx->bio, page, plen, poff);
done:
/*
* Move the caller beyond our range so that it keeps making progress.
@ -1463,6 +1464,28 @@ struct iomap_dio {
};
};
int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
{
struct request_queue *q = READ_ONCE(kiocb->private);
if (!q)
return 0;
return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
}
EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
struct bio *bio)
{
atomic_inc(&dio->ref);
if (dio->iocb->ki_flags & IOCB_HIPRI)
bio_set_polled(bio, dio->iocb);
dio->submit.last_queue = bdev_get_queue(iomap->bdev);
dio->submit.cookie = submit_bio(bio);
}
static ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
struct kiocb *iocb = dio->iocb;
@ -1568,14 +1591,15 @@ static void iomap_dio_bio_end_io(struct bio *bio)
} else {
struct bio_vec *bvec;
int i;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, i)
bio_for_each_segment_all(bvec, bio, i, iter_all)
put_page(bvec->bv_page);
bio_put(bio);
}
}
static blk_qc_t
static void
iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
unsigned len)
{
@ -1589,15 +1613,10 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
if (dio->iocb->ki_flags & IOCB_HIPRI)
flags |= REQ_HIPRI;
get_page(page);
__bio_add_page(bio, page, len, 0);
bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
atomic_inc(&dio->ref);
return submit_bio(bio);
iomap_dio_submit_bio(dio, iomap, bio);
}
static loff_t
@ -1700,9 +1719,6 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
bio_set_pages_dirty(bio);
}
if (dio->iocb->ki_flags & IOCB_HIPRI)
bio->bi_opf |= REQ_HIPRI;
iov_iter_advance(dio->submit.iter, n);
dio->size += n;
@ -1710,11 +1726,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
copied += n;
nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
atomic_inc(&dio->ref);
dio->submit.last_queue = bdev_get_queue(iomap->bdev);
dio->submit.cookie = submit_bio(bio);
iomap_dio_submit_bio(dio, iomap, bio);
} while (nr_pages);
/*
@ -1925,6 +1937,9 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (dio->flags & IOMAP_DIO_WRITE_FUA)
dio->flags &= ~IOMAP_DIO_NEED_SYNC;
WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
WRITE_ONCE(iocb->private, dio->submit.last_queue);
/*
* We are about to drop our additional submission reference, which
* might be the last reference to the dio. There are three three

View File

@ -48,8 +48,9 @@ static void mpage_end_io(struct bio *bio)
{
struct bio_vec *bv;
int i;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bv, bio, i) {
bio_for_each_segment_all(bv, bio, i, iter_all) {
struct page *page = bv->bv_page;
page_endio(page, bio_op(bio),
blk_status_to_errno(bio->bi_status));

Some files were not shown because too many files have changed in this diff Show More