mirror of
https://github.com/torvalds/linux.git
synced 2024-11-10 22:21:40 +00:00
Merge branch 'for-linus' of git://git.kernel.dk/linux-block
Pull block-related fixes from Jens Axboe: - Improvements to the buffered and direct write IO plugging from Fengguang. - Abstract out the mapping of a bio in a request, and use that to provide a blk_bio_map_sg() helper. Useful for mapping just a bio instead of a full request. - Regression fix from Hugh, fixing up a patch that went into the previous release cycle (and marked stable, too) attempting to prevent a loop in __getblk_slow(). - Updates to discard requests, fixing up the sizing and how we align them. Also a change to disallow merging of discard requests, since that doesn't really work properly yet. - A few drbd fixes. - Documentation updates. * 'for-linus' of git://git.kernel.dk/linux-block: block: replace __getblk_slow misfix by grow_dev_page fix drbd: Write all pages of the bitmap after an online resize drbd: Finish requests that completed while IO was frozen drbd: fix drbd wire compatibility for empty flushes Documentation: update tunable options in block/cfq-iosched.txt Documentation: update tunable options in block/cfq-iosched.txt Documentation: update missing index files in block/00-INDEX block: move down direct IO plugging block: remove plugging at buffered write time block: disable discard request merge temporarily bio: Fix potential memory leak in bio_find_or_create_slab() block: Don't use static to define "void *p" in show_partition_start() block: Add blk_bio_map_sg() helper block: Introduce __blk_segment_map_sg() helper fs/block-dev.c:fix performance regression in O_DIRECT writes to md block devices block: split discard into aligned requests block: reorganize rounding of max_discard_sectors
This commit is contained in:
commit
a7e546f175
@ -3,15 +3,21 @@
|
||||
biodoc.txt
|
||||
- Notes on the Generic Block Layer Rewrite in Linux 2.5
|
||||
capability.txt
|
||||
- Generic Block Device Capability (/sys/block/<disk>/capability)
|
||||
- Generic Block Device Capability (/sys/block/<device>/capability)
|
||||
cfq-iosched.txt
|
||||
- CFQ IO scheduler tunables
|
||||
data-integrity.txt
|
||||
- Block data integrity
|
||||
deadline-iosched.txt
|
||||
- Deadline IO scheduler tunables
|
||||
ioprio.txt
|
||||
- Block io priorities (in CFQ scheduler)
|
||||
queue-sysfs.txt
|
||||
- Queue's sysfs entries
|
||||
request.txt
|
||||
- The members of struct request (in include/linux/blkdev.h)
|
||||
stat.txt
|
||||
- Block layer statistics in /sys/block/<dev>/stat
|
||||
- Block layer statistics in /sys/block/<device>/stat
|
||||
switching-sched.txt
|
||||
- Switching I/O schedulers at runtime
|
||||
writeback_cache_control.txt
|
||||
|
@ -1,3 +1,14 @@
|
||||
CFQ (Complete Fairness Queueing)
|
||||
===============================
|
||||
|
||||
The main aim of CFQ scheduler is to provide a fair allocation of the disk
|
||||
I/O bandwidth for all the processes which request an I/O operation.
|
||||
|
||||
CFQ maintains the per process queue for the processes which request I/O
|
||||
operation (synchronous requests). In case of asynchronous requests, all the
|
||||
requests from all the processes are batched together according to their
|
||||
process's I/O priority.
|
||||
|
||||
CFQ ioscheduler tunables
|
||||
========================
|
||||
|
||||
@ -25,6 +36,72 @@ there are multiple spindles behind single LUN (Host based hardware RAID
|
||||
controller or for storage arrays), setting slice_idle=0 might end up in better
|
||||
throughput and acceptable latencies.
|
||||
|
||||
back_seek_max
|
||||
-------------
|
||||
This specifies, given in Kbytes, the maximum "distance" for backward seeking.
|
||||
The distance is the amount of space from the current head location to the
|
||||
sectors that are backward in terms of distance.
|
||||
|
||||
This parameter allows the scheduler to anticipate requests in the "backward"
|
||||
direction and consider them as being the "next" if they are within this
|
||||
distance from the current head location.
|
||||
|
||||
back_seek_penalty
|
||||
-----------------
|
||||
This parameter is used to compute the cost of backward seeking. If the
|
||||
backward distance of request is just 1/back_seek_penalty from a "front"
|
||||
request, then the seeking cost of two requests is considered equivalent.
|
||||
|
||||
So scheduler will not bias toward one or the other request (otherwise scheduler
|
||||
will bias toward front request). Default value of back_seek_penalty is 2.
|
||||
|
||||
fifo_expire_async
|
||||
-----------------
|
||||
This parameter is used to set the timeout of asynchronous requests. Default
|
||||
value of this is 248ms.
|
||||
|
||||
fifo_expire_sync
|
||||
----------------
|
||||
This parameter is used to set the timeout of synchronous requests. Default
|
||||
value of this is 124ms. To favor synchronous requests over asynchronous
|
||||
ones, this value should be decreased relative to fifo_expire_async.
|
||||
|
||||
slice_async
|
||||
-----------
|
||||
This parameter is the same as slice_sync but for the asynchronous queue. The
|
||||
default value is 40ms.
|
||||
|
||||
slice_async_rq
|
||||
--------------
|
||||
This parameter is used to limit the dispatching of asynchronous requests to
|
||||
the device request queue in a queue's slice time. The maximum number of requests that
|
||||
are allowed to be dispatched also depends upon the io priority. Default value
|
||||
for this is 2.
|
||||
|
||||
slice_sync
|
||||
----------
|
||||
When a queue is selected for execution, the queue's IO requests are only
|
||||
executed for a certain amount of time (time_slice) before switching to another
|
||||
queue. This parameter is used to calculate the time slice of synchronous
|
||||
queue.
|
||||
|
||||
time_slice is computed using the below equation:-
|
||||
time_slice = slice_sync + (slice_sync/5 * (4 - prio)). To increase the
|
||||
time_slice of synchronous queue, increase the value of slice_sync. Default
|
||||
value is 100ms.
|
||||
|
||||
quantum
|
||||
-------
|
||||
This specifies the number of requests dispatched to the device queue. In a
|
||||
queue's time slice, a request will not be dispatched if the number of requests
|
||||
in the device exceeds this parameter. This parameter is used for synchronous
|
||||
requests.
|
||||
|
||||
In case of storage with several disks, this setting can limit the parallel
|
||||
processing of requests. Therefore, increasing the value can improve the
|
||||
performance, although this can cause the latency of some I/O to increase due
|
||||
to the larger number of requests.
|
||||
|
||||
CFQ IOPS Mode for group scheduling
|
||||
===================================
|
||||
Basic CFQ design is to provide priority based time slices. Higher priority
|
||||
|
@ -9,20 +9,71 @@ These files are the ones found in the /sys/block/xxx/queue/ directory.
|
||||
Files denoted with a RO postfix are readonly and the RW postfix means
|
||||
read-write.
|
||||
|
||||
add_random (RW)
|
||||
----------------
|
||||
This file allows turning off the disk entropy contribution. Default
|
||||
value of this file is '1'(on).
|
||||
|
||||
discard_granularity (RO)
|
||||
-----------------------
|
||||
This shows the size of internal allocation of the device in bytes, if
|
||||
reported by the device. A value of '0' means device does not support
|
||||
the discard functionality.
|
||||
|
||||
discard_max_bytes (RO)
|
||||
----------------------
|
||||
Devices that support discard functionality may have internal limits on
|
||||
the number of bytes that can be trimmed or unmapped in a single operation.
|
||||
The discard_max_bytes parameter is set by the device driver to the maximum
|
||||
number of bytes that can be discarded in a single operation. Discard
|
||||
requests issued to the device must not exceed this limit. A discard_max_bytes
|
||||
value of 0 means that the device does not support discard functionality.
|
||||
|
||||
discard_zeroes_data (RO)
|
||||
------------------------
|
||||
When read, this file will show if the discarded blocks are zeroed by the
|
||||
device or not. If its value is '1' the blocks are zeroed, otherwise not.
|
||||
|
||||
hw_sector_size (RO)
|
||||
-------------------
|
||||
This is the hardware sector size of the device, in bytes.
|
||||
|
||||
iostats (RW)
|
||||
-------------
|
||||
This file is used to control (on/off) the iostats accounting of the
|
||||
disk.
|
||||
|
||||
logical_block_size (RO)
|
||||
-----------------------
|
||||
This is the logical block size of the device, in bytes.
|
||||
|
||||
max_hw_sectors_kb (RO)
|
||||
----------------------
|
||||
This is the maximum number of kilobytes supported in a single data transfer.
|
||||
|
||||
max_integrity_segments (RO)
|
||||
---------------------------
|
||||
When read, this file shows the max limit of integrity segments as
|
||||
set by block layer which a hardware controller can handle.
|
||||
|
||||
max_sectors_kb (RW)
|
||||
-------------------
|
||||
This is the maximum number of kilobytes that the block layer will allow
|
||||
for a filesystem request. Must be smaller than or equal to the maximum
|
||||
size allowed by the hardware.
|
||||
|
||||
max_segments (RO)
|
||||
-----------------
|
||||
Maximum number of segments of the device.
|
||||
|
||||
max_segment_size (RO)
|
||||
---------------------
|
||||
Maximum segment size of the device.
|
||||
|
||||
minimum_io_size (RO)
|
||||
--------------------
|
||||
This is the smallest preferred io size reported by the device.
|
||||
|
||||
nomerges (RW)
|
||||
-------------
|
||||
This enables the user to disable the lookup logic involved with IO
|
||||
@ -45,11 +96,24 @@ per-block-cgroup request pool. IOW, if there are N block cgroups,
|
||||
each request queue may have up to N request pools, each independently
|
||||
regulated by nr_requests.
|
||||
|
||||
optimal_io_size (RO)
|
||||
--------------------
|
||||
This is the optimal io size reported by the device.
|
||||
|
||||
physical_block_size (RO)
|
||||
------------------------
|
||||
This is the physical block size of the device, in bytes.
|
||||
|
||||
read_ahead_kb (RW)
|
||||
------------------
|
||||
Maximum number of kilobytes to read-ahead for filesystems on this block
|
||||
device.
|
||||
|
||||
rotational (RW)
|
||||
---------------
|
||||
This file is used to state whether the device is of rotational type or
|
||||
non-rotational type.
|
||||
|
||||
rq_affinity (RW)
|
||||
----------------
|
||||
If this option is '1', the block layer will migrate request completions to the
|
||||
|
@ -44,6 +44,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
|
||||
struct request_queue *q = bdev_get_queue(bdev);
|
||||
int type = REQ_WRITE | REQ_DISCARD;
|
||||
unsigned int max_discard_sectors;
|
||||
unsigned int granularity, alignment, mask;
|
||||
struct bio_batch bb;
|
||||
struct bio *bio;
|
||||
int ret = 0;
|
||||
@ -54,18 +55,20 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
|
||||
if (!blk_queue_discard(q))
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
/* Zero-sector (unknown) and one-sector granularities are the same. */
|
||||
granularity = max(q->limits.discard_granularity >> 9, 1U);
|
||||
mask = granularity - 1;
|
||||
alignment = (bdev_discard_alignment(bdev) >> 9) & mask;
|
||||
|
||||
/*
|
||||
* Ensure that max_discard_sectors is of the proper
|
||||
* granularity
|
||||
* granularity, so that requests stay aligned after a split.
|
||||
*/
|
||||
max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
|
||||
max_discard_sectors = round_down(max_discard_sectors, granularity);
|
||||
if (unlikely(!max_discard_sectors)) {
|
||||
/* Avoid infinite loop below. Being cautious never hurts. */
|
||||
return -EOPNOTSUPP;
|
||||
} else if (q->limits.discard_granularity) {
|
||||
unsigned int disc_sects = q->limits.discard_granularity >> 9;
|
||||
|
||||
max_discard_sectors &= ~(disc_sects - 1);
|
||||
}
|
||||
|
||||
if (flags & BLKDEV_DISCARD_SECURE) {
|
||||
@ -79,25 +82,37 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
|
||||
bb.wait = &wait;
|
||||
|
||||
while (nr_sects) {
|
||||
unsigned int req_sects;
|
||||
sector_t end_sect;
|
||||
|
||||
bio = bio_alloc(gfp_mask, 1);
|
||||
if (!bio) {
|
||||
ret = -ENOMEM;
|
||||
break;
|
||||
}
|
||||
|
||||
req_sects = min_t(sector_t, nr_sects, max_discard_sectors);
|
||||
|
||||
/*
|
||||
* If splitting a request, and the next starting sector would be
|
||||
* misaligned, stop the discard at the previous aligned sector.
|
||||
*/
|
||||
end_sect = sector + req_sects;
|
||||
if (req_sects < nr_sects && (end_sect & mask) != alignment) {
|
||||
end_sect =
|
||||
round_down(end_sect - alignment, granularity)
|
||||
+ alignment;
|
||||
req_sects = end_sect - sector;
|
||||
}
|
||||
|
||||
bio->bi_sector = sector;
|
||||
bio->bi_end_io = bio_batch_end_io;
|
||||
bio->bi_bdev = bdev;
|
||||
bio->bi_private = &bb;
|
||||
|
||||
if (nr_sects > max_discard_sectors) {
|
||||
bio->bi_size = max_discard_sectors << 9;
|
||||
nr_sects -= max_discard_sectors;
|
||||
sector += max_discard_sectors;
|
||||
} else {
|
||||
bio->bi_size = nr_sects << 9;
|
||||
nr_sects = 0;
|
||||
}
|
||||
bio->bi_size = req_sects << 9;
|
||||
nr_sects -= req_sects;
|
||||
sector = end_sect;
|
||||
|
||||
atomic_inc(&bb.done);
|
||||
submit_bio(type, bio);
|
||||
|
@ -110,6 +110,49 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Map one bio_vec into the scatterlist being built, either by growing the
 * current scatterlist entry or by starting a new one.
 *
 * @q:       queue whose max segment size and boundary rules are consulted
 * @bvec:    the bio_vec to add
 * @sglist:  start of the scatterlist; used for the very first entry
 * @bvprv:   in/out: previously mapped bio_vec (NULL on the first call);
 *           always updated to @bvec before returning
 * @sg:      in/out: current scatterlist entry (NULL on the first call)
 * @nsegs:   in/out: incremented each time a new entry is started
 * @cluster: when *cluster is zero, merging is never attempted and every
 *           bio_vec gets its own entry
 *
 * The state lives in the caller so the same helper can be driven from
 * different segment iterators.
 */
static void
|
||||
__blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
|
||||
struct scatterlist *sglist, struct bio_vec **bvprv,
|
||||
struct scatterlist **sg, int *nsegs, int *cluster)
|
||||
{
|
||||
|
||||
int nbytes = bvec->bv_len;
|
||||
|
||||
/*
 * Try to merge into the current entry; any failed check below falls
 * through to starting a new entry instead.
 */
if (*bvprv && *cluster) {
|
||||
if ((*sg)->length + nbytes > queue_max_segment_size(q))
|
||||
goto new_segment;
|
||||
|
||||
if (!BIOVEC_PHYS_MERGEABLE(*bvprv, bvec))
|
||||
goto new_segment;
|
||||
if (!BIOVEC_SEG_BOUNDARY(q, *bvprv, bvec))
|
||||
goto new_segment;
|
||||
|
||||
/* All checks passed: extend the current entry in place. */
(*sg)->length += nbytes;
|
||||
} else {
|
||||
new_segment:
|
||||
/* First entry starts at the head of the caller's list. */
if (!*sg)
|
||||
*sg = sglist;
|
||||
else {
|
||||
/*
|
||||
* If the driver previously mapped a shorter
|
||||
* list, we could see a termination bit
|
||||
* prematurely unless it fully inits the sg
|
||||
* table on each mapping. We KNOW that there
|
||||
* must be more entries here or the driver
|
||||
* would be buggy, so force clear the
|
||||
* termination bit to avoid doing a full
|
||||
* sg_init_table() in drivers for each command.
|
||||
*/
|
||||
(*sg)->page_link &= ~0x02;
|
||||
*sg = sg_next(*sg);
|
||||
}
|
||||
|
||||
sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
|
||||
(*nsegs)++;
|
||||
}
|
||||
/* Remember this bio_vec so the next call can test mergeability against it. */
*bvprv = bvec;
|
||||
}
|
||||
|
||||
/*
|
||||
* map a request to scatterlist, return number of sg entries setup. Caller
|
||||
* must make sure sg can hold rq->nr_phys_segments entries
|
||||
@ -131,41 +174,8 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
|
||||
bvprv = NULL;
|
||||
sg = NULL;
|
||||
rq_for_each_segment(bvec, rq, iter) {
|
||||
int nbytes = bvec->bv_len;
|
||||
|
||||
if (bvprv && cluster) {
|
||||
if (sg->length + nbytes > queue_max_segment_size(q))
|
||||
goto new_segment;
|
||||
|
||||
if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
|
||||
goto new_segment;
|
||||
if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
|
||||
goto new_segment;
|
||||
|
||||
sg->length += nbytes;
|
||||
} else {
|
||||
new_segment:
|
||||
if (!sg)
|
||||
sg = sglist;
|
||||
else {
|
||||
/*
|
||||
* If the driver previously mapped a shorter
|
||||
* list, we could see a termination bit
|
||||
* prematurely unless it fully inits the sg
|
||||
* table on each mapping. We KNOW that there
|
||||
* must be more entries here or the driver
|
||||
* would be buggy, so force clear the
|
||||
* termination bit to avoid doing a full
|
||||
* sg_init_table() in drivers for each command.
|
||||
*/
|
||||
sg->page_link &= ~0x02;
|
||||
sg = sg_next(sg);
|
||||
}
|
||||
|
||||
sg_set_page(sg, bvec->bv_page, nbytes, bvec->bv_offset);
|
||||
nsegs++;
|
||||
}
|
||||
bvprv = bvec;
|
||||
__blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg,
|
||||
&nsegs, &cluster);
|
||||
} /* segments in rq */
|
||||
|
||||
|
||||
@ -199,6 +209,43 @@ new_segment:
|
||||
}
|
||||
EXPORT_SYMBOL(blk_rq_map_sg);
|
||||
|
||||
/**
|
||||
* blk_bio_map_sg - map a bio to a scatterlist
|
||||
* @q: request_queue in question
|
||||
* @bio: bio being mapped
|
||||
* @sglist: scatterlist being mapped
|
||||
*
|
||||
* Note:
|
||||
* Caller must make sure sg can hold bio->bi_phys_segments entries
|
||||
*
|
||||
* Will return the number of sg entries setup
|
||||
*/
|
||||
int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
|
||||
struct scatterlist *sglist)
|
||||
{
|
||||
struct bio_vec *bvec, *bvprv;
|
||||
struct scatterlist *sg;
|
||||
int nsegs, cluster;
|
||||
unsigned long i;
|
||||
|
||||
nsegs = 0;
|
||||
cluster = blk_queue_cluster(q);
|
||||
|
||||
/* NULL bvprv/sg tell the helper that no entry has been started yet. */
bvprv = NULL;
|
||||
sg = NULL;
|
||||
bio_for_each_segment(bvec, bio, i) {
|
||||
__blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg,
|
||||
&nsegs, &cluster);
|
||||
} /* segments in bio */
|
||||
|
||||
/* sg is still NULL if the loop body never ran; nothing to terminate then. */
if (sg)
|
||||
sg_mark_end(sg);
|
||||
|
||||
/*
 * Only checked when bi_phys_segments is non-zero: producing more entries
 * than it reports would overrun the space the caller was told to provide.
 */
BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments);
|
||||
return nsegs;
|
||||
}
|
||||
EXPORT_SYMBOL(blk_bio_map_sg);
|
||||
|
||||
static inline int ll_new_hw_segment(struct request_queue *q,
|
||||
struct request *req,
|
||||
struct bio *bio)
|
||||
|
@ -835,7 +835,7 @@ static void disk_seqf_stop(struct seq_file *seqf, void *v)
|
||||
|
||||
static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
|
||||
{
|
||||
static void *p;
|
||||
void *p;
|
||||
|
||||
p = disk_seqf_start(seqf, pos);
|
||||
if (!IS_ERR_OR_NULL(p) && !*pos)
|
||||
|
@ -889,6 +889,7 @@ struct bm_aio_ctx {
|
||||
unsigned int done;
|
||||
unsigned flags;
|
||||
#define BM_AIO_COPY_PAGES 1
|
||||
#define BM_WRITE_ALL_PAGES 2
|
||||
int error;
|
||||
struct kref kref;
|
||||
};
|
||||
@ -1059,7 +1060,8 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w
|
||||
if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
|
||||
break;
|
||||
if (rw & WRITE) {
|
||||
if (bm_test_page_unchanged(b->bm_pages[i])) {
|
||||
if (!(flags & BM_WRITE_ALL_PAGES) &&
|
||||
bm_test_page_unchanged(b->bm_pages[i])) {
|
||||
dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);
|
||||
continue;
|
||||
}
|
||||
@ -1140,6 +1142,17 @@ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
|
||||
return bm_rw(mdev, WRITE, 0, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* drbd_bm_write_all() - Write the whole bitmap to its on disk location.
|
||||
* @mdev: DRBD device.
|
||||
*
|
||||
* Will write all pages.
|
||||
*/
|
||||
int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local)
|
||||
{
|
||||
/*
 * BM_WRITE_ALL_PAGES makes bm_rw() bypass its "page unchanged" skip on
 * writes; the trailing 0 is lazy_writeout_upper_idx, i.e. no upper index
 * at which bm_rw() stops early.
 */
return bm_rw(mdev, WRITE, BM_WRITE_ALL_PAGES, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* drbd_bm_lazy_write_out() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
|
||||
* @mdev: DRBD device.
|
||||
|
@ -1469,6 +1469,7 @@ extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
|
||||
extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local);
|
||||
extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
|
||||
extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
|
||||
extern int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local);
|
||||
extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local);
|
||||
extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
|
||||
unsigned long al_enr);
|
||||
|
@ -79,6 +79,7 @@ static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
|
||||
static void md_sync_timer_fn(unsigned long data);
|
||||
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
|
||||
static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
|
||||
static void _tl_clear(struct drbd_conf *mdev);
|
||||
|
||||
MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
|
||||
"Lars Ellenberg <lars@linbit.com>");
|
||||
@ -432,19 +433,10 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
|
||||
|
||||
/* Actions operating on the disk state, also want to work on
|
||||
requests that got barrier acked. */
|
||||
switch (what) {
|
||||
case fail_frozen_disk_io:
|
||||
case restart_frozen_disk_io:
|
||||
list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
|
||||
req = list_entry(le, struct drbd_request, tl_requests);
|
||||
_req_mod(req, what);
|
||||
}
|
||||
|
||||
case connection_lost_while_pending:
|
||||
case resend:
|
||||
break;
|
||||
default:
|
||||
dev_err(DEV, "what = %d in _tl_restart()\n", what);
|
||||
list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
|
||||
req = list_entry(le, struct drbd_request, tl_requests);
|
||||
_req_mod(req, what);
|
||||
}
|
||||
}
|
||||
|
||||
@ -458,12 +450,17 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
|
||||
* receiver thread and the worker thread.
|
||||
*/
|
||||
void tl_clear(struct drbd_conf *mdev)
|
||||
{
|
||||
/*
 * Locked wrapper: takes req_lock around _tl_clear(). Callers that already
 * hold req_lock (e.g. after_state_ch()) call _tl_clear() directly.
 */
spin_lock_irq(&mdev->req_lock);
|
||||
_tl_clear(mdev);
|
||||
spin_unlock_irq(&mdev->req_lock);
|
||||
}
|
||||
|
||||
static void _tl_clear(struct drbd_conf *mdev)
|
||||
{
|
||||
struct list_head *le, *tle;
|
||||
struct drbd_request *r;
|
||||
|
||||
spin_lock_irq(&mdev->req_lock);
|
||||
|
||||
_tl_restart(mdev, connection_lost_while_pending);
|
||||
|
||||
/* we expect this list to be empty. */
|
||||
@ -482,7 +479,6 @@ void tl_clear(struct drbd_conf *mdev)
|
||||
|
||||
memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
|
||||
|
||||
spin_unlock_irq(&mdev->req_lock);
|
||||
}
|
||||
|
||||
void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
|
||||
@ -1476,12 +1472,12 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
|
||||
if (ns.susp_fen) {
|
||||
/* case1: The outdate peer handler is successful: */
|
||||
if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
|
||||
tl_clear(mdev);
|
||||
if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
|
||||
drbd_uuid_new_current(mdev);
|
||||
clear_bit(NEW_CUR_UUID, &mdev->flags);
|
||||
}
|
||||
spin_lock_irq(&mdev->req_lock);
|
||||
_tl_clear(mdev);
|
||||
_drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
|
||||
spin_unlock_irq(&mdev->req_lock);
|
||||
}
|
||||
|
@ -674,8 +674,8 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
|
||||
la_size_changed && md_moved ? "size changed and md moved" :
|
||||
la_size_changed ? "size changed" : "md moved");
|
||||
/* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
|
||||
err = drbd_bitmap_io(mdev, &drbd_bm_write,
|
||||
"size changed", BM_LOCKED_MASK);
|
||||
err = drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
|
||||
"size changed", BM_LOCKED_MASK);
|
||||
if (err) {
|
||||
rv = dev_size_error;
|
||||
goto out;
|
||||
|
@ -695,6 +695,12 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
|
||||
break;
|
||||
|
||||
case resend:
|
||||
/* Simply complete (local only) READs. */
|
||||
if (!(req->rq_state & RQ_WRITE) && !req->w.cb) {
|
||||
_req_may_be_done(req, m);
|
||||
break;
|
||||
}
|
||||
|
||||
/* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
|
||||
before the connection loss (B&C only); only P_BARRIER_ACK was missing.
|
||||
Throwing them out of the TL here by pretending we got a BARRIER_ACK
|
||||
@ -834,7 +840,15 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
|
||||
req->private_bio = NULL;
|
||||
}
|
||||
if (rw == WRITE) {
|
||||
remote = 1;
|
||||
/* Need to replicate writes. Unless it is an empty flush,
|
||||
* which is better mapped to a DRBD P_BARRIER packet,
|
||||
* also for drbd wire protocol compatibility reasons. */
|
||||
if (unlikely(size == 0)) {
|
||||
/* The only size==0 bios we expect are empty flushes. */
|
||||
D_ASSERT(bio->bi_rw & REQ_FLUSH);
|
||||
remote = 0;
|
||||
} else
|
||||
remote = 1;
|
||||
} else {
|
||||
/* READ || READA */
|
||||
if (local) {
|
||||
@ -870,8 +884,11 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
|
||||
* extent. This waits for any resync activity in the corresponding
|
||||
* resync extent to finish, and, if necessary, pulls in the target
|
||||
* extent into the activity log, which involves further disk io because
|
||||
* of transactional on-disk meta data updates. */
|
||||
if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) {
|
||||
* of transactional on-disk meta data updates.
|
||||
* Empty flushes don't need to go into the activity log, they can only
|
||||
* flush data for pending writes which are already in there. */
|
||||
if (rw == WRITE && local && size
|
||||
&& !test_bit(AL_SUSPENDED, &mdev->flags)) {
|
||||
req->rq_state |= RQ_IN_ACT_LOG;
|
||||
drbd_al_begin_io(mdev, sector);
|
||||
}
|
||||
@ -994,7 +1011,10 @@ allocate_barrier:
|
||||
if (rw == WRITE && _req_conflicts(req))
|
||||
goto fail_conflicting;
|
||||
|
||||
list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
|
||||
/* no point in adding empty flushes to the transfer log,
|
||||
* they are mapped to drbd barriers already. */
|
||||
if (likely(size!=0))
|
||||
list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
|
||||
|
||||
/* NOTE remote first: to get the concurrent write detection right,
|
||||
* we must register the request before start of local IO. */
|
||||
@ -1014,6 +1034,14 @@ allocate_barrier:
|
||||
mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96)
|
||||
maybe_pull_ahead(mdev);
|
||||
|
||||
/* If this was a flush, queue a drbd barrier/start a new epoch.
|
||||
* Unless the current epoch was empty anyways, or we are not currently
|
||||
* replicating, in which case there is no point. */
|
||||
if (unlikely(bio->bi_rw & REQ_FLUSH)
|
||||
&& mdev->newest_tle->n_writes
|
||||
&& drbd_should_do_remote(mdev->state))
|
||||
queue_barrier(mdev);
|
||||
|
||||
spin_unlock_irq(&mdev->req_lock);
|
||||
kfree(b); /* if someone else has beaten us to it... */
|
||||
|
||||
|
11
fs/bio.c
11
fs/bio.c
@ -73,7 +73,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
|
||||
{
|
||||
unsigned int sz = sizeof(struct bio) + extra_size;
|
||||
struct kmem_cache *slab = NULL;
|
||||
struct bio_slab *bslab;
|
||||
struct bio_slab *bslab, *new_bio_slabs;
|
||||
unsigned int i, entry = -1;
|
||||
|
||||
mutex_lock(&bio_slab_lock);
|
||||
@ -97,11 +97,12 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
|
||||
|
||||
if (bio_slab_nr == bio_slab_max && entry == -1) {
|
||||
bio_slab_max <<= 1;
|
||||
bio_slabs = krealloc(bio_slabs,
|
||||
bio_slab_max * sizeof(struct bio_slab),
|
||||
GFP_KERNEL);
|
||||
if (!bio_slabs)
|
||||
new_bio_slabs = krealloc(bio_slabs,
|
||||
bio_slab_max * sizeof(struct bio_slab),
|
||||
GFP_KERNEL);
|
||||
if (!new_bio_slabs)
|
||||
goto out_unlock;
|
||||
bio_slabs = new_bio_slabs;
|
||||
}
|
||||
if (entry == -1)
|
||||
entry = bio_slab_nr++;
|
||||
|
@ -1578,10 +1578,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
|
||||
unsigned long nr_segs, loff_t pos)
|
||||
{
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct blk_plug plug;
|
||||
ssize_t ret;
|
||||
|
||||
BUG_ON(iocb->ki_pos != pos);
|
||||
|
||||
blk_start_plug(&plug);
|
||||
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
|
||||
if (ret > 0 || ret == -EIOCBQUEUED) {
|
||||
ssize_t err;
|
||||
@ -1590,6 +1592,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
|
||||
if (err < 0 && ret > 0)
|
||||
ret = err;
|
||||
}
|
||||
blk_finish_plug(&plug);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blkdev_aio_write);
|
||||
|
66
fs/buffer.c
66
fs/buffer.c
@ -914,7 +914,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
|
||||
/*
|
||||
* Initialise the state of a blockdev page's buffers.
|
||||
*/
|
||||
static void
|
||||
static sector_t
|
||||
init_page_buffers(struct page *page, struct block_device *bdev,
|
||||
sector_t block, int size)
|
||||
{
|
||||
@ -936,33 +936,41 @@ init_page_buffers(struct page *page, struct block_device *bdev,
|
||||
block++;
|
||||
bh = bh->b_this_page;
|
||||
} while (bh != head);
|
||||
|
||||
/*
|
||||
* Caller needs to validate requested block against end of device.
|
||||
*/
|
||||
return end_block;
|
||||
}
|
||||
|
||||
/*
|
||||
* Create the page-cache page that contains the requested block.
|
||||
*
|
||||
* This is user purely for blockdev mappings.
|
||||
* This is used purely for blockdev mappings.
|
||||
*/
|
||||
static struct page *
|
||||
static int
|
||||
grow_dev_page(struct block_device *bdev, sector_t block,
|
||||
pgoff_t index, int size)
|
||||
pgoff_t index, int size, int sizebits)
|
||||
{
|
||||
struct inode *inode = bdev->bd_inode;
|
||||
struct page *page;
|
||||
struct buffer_head *bh;
|
||||
sector_t end_block;
|
||||
int ret = 0; /* Will call free_more_memory() */
|
||||
|
||||
page = find_or_create_page(inode->i_mapping, index,
|
||||
(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
|
||||
if (!page)
|
||||
return NULL;
|
||||
return ret;
|
||||
|
||||
BUG_ON(!PageLocked(page));
|
||||
|
||||
if (page_has_buffers(page)) {
|
||||
bh = page_buffers(page);
|
||||
if (bh->b_size == size) {
|
||||
init_page_buffers(page, bdev, block, size);
|
||||
return page;
|
||||
end_block = init_page_buffers(page, bdev,
|
||||
index << sizebits, size);
|
||||
goto done;
|
||||
}
|
||||
if (!try_to_free_buffers(page))
|
||||
goto failed;
|
||||
@ -982,14 +990,14 @@ grow_dev_page(struct block_device *bdev, sector_t block,
|
||||
*/
|
||||
spin_lock(&inode->i_mapping->private_lock);
|
||||
link_dev_buffers(page, bh);
|
||||
init_page_buffers(page, bdev, block, size);
|
||||
end_block = init_page_buffers(page, bdev, index << sizebits, size);
|
||||
spin_unlock(&inode->i_mapping->private_lock);
|
||||
return page;
|
||||
|
||||
done:
|
||||
ret = (block < end_block) ? 1 : -ENXIO;
|
||||
failed:
|
||||
unlock_page(page);
|
||||
page_cache_release(page);
|
||||
return NULL;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -999,7 +1007,6 @@ failed:
|
||||
static int
|
||||
grow_buffers(struct block_device *bdev, sector_t block, int size)
|
||||
{
|
||||
struct page *page;
|
||||
pgoff_t index;
|
||||
int sizebits;
|
||||
|
||||
@ -1023,22 +1030,14 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
|
||||
bdevname(bdev, b));
|
||||
return -EIO;
|
||||
}
|
||||
block = index << sizebits;
|
||||
|
||||
/* Create a page with the proper size buffers.. */
|
||||
page = grow_dev_page(bdev, block, index, size);
|
||||
if (!page)
|
||||
return 0;
|
||||
unlock_page(page);
|
||||
page_cache_release(page);
|
||||
return 1;
|
||||
return grow_dev_page(bdev, block, index, size, sizebits);
|
||||
}
|
||||
|
||||
static struct buffer_head *
|
||||
__getblk_slow(struct block_device *bdev, sector_t block, int size)
|
||||
{
|
||||
int ret;
|
||||
struct buffer_head *bh;
|
||||
|
||||
/* Size must be multiple of hard sectorsize */
|
||||
if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
|
||||
(size < 512 || size > PAGE_SIZE))) {
|
||||
@ -1051,21 +1050,20 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
retry:
|
||||
bh = __find_get_block(bdev, block, size);
|
||||
if (bh)
|
||||
return bh;
|
||||
for (;;) {
|
||||
struct buffer_head *bh;
|
||||
int ret;
|
||||
|
||||
ret = grow_buffers(bdev, block, size);
|
||||
if (ret == 0) {
|
||||
free_more_memory();
|
||||
goto retry;
|
||||
} else if (ret > 0) {
|
||||
bh = __find_get_block(bdev, block, size);
|
||||
if (bh)
|
||||
return bh;
|
||||
|
||||
ret = grow_buffers(bdev, block, size);
|
||||
if (ret < 0)
|
||||
return NULL;
|
||||
if (ret == 0)
|
||||
free_more_memory();
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1321,10 +1319,6 @@ EXPORT_SYMBOL(__find_get_block);
|
||||
* which corresponds to the passed block_device, block and size. The
|
||||
* returned buffer has its reference count incremented.
|
||||
*
|
||||
* __getblk() cannot fail - it just keeps trying. If you pass it an
|
||||
* illegal block number, __getblk() will happily return a buffer_head
|
||||
* which represents the non-existent block. Very weird.
|
||||
*
|
||||
* __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
|
||||
* attempt is failing. FIXME, perhaps?
|
||||
*/
|
||||
|
@ -1062,6 +1062,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
|
||||
unsigned long user_addr;
|
||||
size_t bytes;
|
||||
struct buffer_head map_bh = { 0, };
|
||||
struct blk_plug plug;
|
||||
|
||||
if (rw & WRITE)
|
||||
rw = WRITE_ODIRECT;
|
||||
@ -1177,6 +1178,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
|
||||
PAGE_SIZE - user_addr / PAGE_SIZE);
|
||||
}
|
||||
|
||||
blk_start_plug(&plug);
|
||||
|
||||
for (seg = 0; seg < nr_segs; seg++) {
|
||||
user_addr = (unsigned long)iov[seg].iov_base;
|
||||
sdio.size += bytes = iov[seg].iov_len;
|
||||
@ -1235,6 +1238,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
|
||||
if (sdio.bio)
|
||||
dio_bio_submit(dio, &sdio);
|
||||
|
||||
blk_finish_plug(&plug);
|
||||
|
||||
/*
|
||||
* It is possible that, we return short IO due to end of file.
|
||||
* In that case, we need to release all the pages we got hold on.
|
||||
|
@ -601,7 +601,7 @@ static inline void blk_clear_rl_full(struct request_list *rl, bool sync)
|
||||
* it already be started by driver.
|
||||
*/
|
||||
#define RQ_NOMERGE_FLAGS \
|
||||
(REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA)
|
||||
(REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA | REQ_DISCARD)
|
||||
#define rq_mergeable(rq) \
|
||||
(!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \
|
||||
(((rq)->cmd_flags & REQ_DISCARD) || \
|
||||
@ -894,6 +894,8 @@ extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
|
||||
extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
|
||||
|
||||
extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
|
||||
extern int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
|
||||
struct scatterlist *sglist);
|
||||
extern void blk_dump_rq_flags(struct request *, char *);
|
||||
extern long nr_blockdev_pages(void);
|
||||
|
||||
@ -1139,6 +1141,16 @@ static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector
|
||||
& (lim->discard_granularity - 1);
|
||||
}
|
||||
|
||||
static inline int bdev_discard_alignment(struct block_device *bdev)
|
||||
{
|
||||
struct request_queue *q = bdev_get_queue(bdev);
|
||||
|
||||
if (bdev != bdev->bd_contains)
|
||||
return bdev->bd_part->discard_alignment;
|
||||
|
||||
return q->limits.discard_alignment;
|
||||
}
|
||||
|
||||
static inline unsigned int queue_discard_zeroes_data(struct request_queue *q)
|
||||
{
|
||||
if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1)
|
||||
|
@ -1412,12 +1412,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
|
||||
retval = filemap_write_and_wait_range(mapping, pos,
|
||||
pos + iov_length(iov, nr_segs) - 1);
|
||||
if (!retval) {
|
||||
struct blk_plug plug;
|
||||
|
||||
blk_start_plug(&plug);
|
||||
retval = mapping->a_ops->direct_IO(READ, iocb,
|
||||
iov, pos, nr_segs);
|
||||
blk_finish_plug(&plug);
|
||||
}
|
||||
if (retval > 0) {
|
||||
*ppos = pos + retval;
|
||||
@ -2527,14 +2523,12 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
|
||||
{
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct inode *inode = file->f_mapping->host;
|
||||
struct blk_plug plug;
|
||||
ssize_t ret;
|
||||
|
||||
BUG_ON(iocb->ki_pos != pos);
|
||||
|
||||
sb_start_write(inode->i_sb);
|
||||
mutex_lock(&inode->i_mutex);
|
||||
blk_start_plug(&plug);
|
||||
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
|
||||
@ -2545,7 +2539,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
|
||||
if (err < 0 && ret > 0)
|
||||
ret = err;
|
||||
}
|
||||
blk_finish_plug(&plug);
|
||||
sb_end_write(inode->i_sb);
|
||||
return ret;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user