From f8ae6e3eb8251be32c6e913393d9f8d9e0609489 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 14 Jan 2011 08:41:02 +0100 Subject: [PATCH 1/3] block cfq: make queue preempt work for queues from different workload I got this: fio-874 [007] 2157.724514: 8,32 m N cfq874 preempt fio-874 [007] 2157.724519: 8,32 m N cfq830 slice expired t=1 fio-874 [007] 2157.724520: 8,32 m N cfq830 sl_used=1 disp=0 charge=1 iops=0 sect=0 fio-874 [007] 2157.724521: 8,32 m N cfq830 set_active wl_prio:0 wl_type:0 fio-874 [007] 2157.724522: 8,32 m N cfq830 Not idling. st->count:1 cfq830 is an async queue, and preempted by a sync queue cfq874. But since we have cfqg->saved_workload_slice mechanism, the preempt is a nop. Looks currently our preempt is totally broken if the two queues are not from the same workload type. Below patch fixes it. This might cause async queue starvation, but it's what our old code does before cgroup is added. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 8427697c5437..7bfea53c1bb5 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3284,9 +3284,18 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, */ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { + struct cfq_queue *old_cfqq = cfqd->active_queue; + cfq_log_cfqq(cfqd, cfqq, "preempt"); cfq_slice_expired(cfqd, 1); + /* + * workload type is changed, don't save slice, otherwise preempt + * doesn't happen + */ + if (cfqq_type(old_cfqq) != cfqq_type(cfqq)) + cfqq->cfqg->saved_workload_slice = 0; + /* * Put the new queue at the front of the of the current list, * so we know that it will be selected next. 
From c553f8e335c00a7cff3ab3f13e793b13d3f2207f Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 14 Jan 2011 08:41:03 +0100 Subject: [PATCH 2/3] block cfq: compensate preempted queue even if it has no slice assigned If a queue is preempted before it gets slice assigned, the queue doesn't get compensation, which looks unfair. For such queue, we compensate it for a whole slice. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 7bfea53c1bb5..501ffdf0399c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -598,8 +598,8 @@ cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) return cfq_target_latency * cfqg->weight / st->total_weight; } -static inline void -cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) +static inline unsigned +cfq_scaled_group_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) { unsigned slice = cfq_prio_to_slice(cfqd, cfqq); if (cfqd->cfq_latency) { @@ -625,6 +625,14 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) low_slice); } } + return slice; +} + +static inline void +cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + unsigned slice = cfq_scaled_group_slice(cfqd, cfqq); + cfqq->slice_start = jiffies; cfqq->slice_end = jiffies + slice; cfqq->allocated_slice = slice; @@ -1661,8 +1669,11 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, /* * store what was left of this slice, if the queue idled/timed out */ - if (timed_out && !cfq_cfqq_slice_new(cfqq)) { - cfqq->slice_resid = cfqq->slice_end - jiffies; + if (timed_out) { + if (cfq_cfqq_slice_new(cfqq)) + cfqq->slice_resid = cfq_scaled_group_slice(cfqd, cfqq); + else + cfqq->slice_resid = cfqq->slice_end - jiffies; cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); } From 49731baa41df404c2c3f44555869ab387363af43 Mon Sep 17 00:00:00 
2001 From: Tejun Heo Date: Fri, 14 Jan 2011 18:43:57 +0100 Subject: [PATCH 3/3] block: restore multiple bd_link_disk_holder() support Commit e09b457b (block: simplify holder symlink handling) incorrectly assumed that there is only one link at maximum. dm may use multiple links and expects block layer to track reference count for each link, which is different from and unrelated to the exclusive device holder identified by @holder when the device is opened. Remove the single holder assumption and automatic removal of the link and revive the per-link reference count tracking. The code essentially behaves the same as before commit e09b457b sans the unnecessary kobject reference count dancing. While at it, note that this facility should not be used by anyone else than the current ones. Sysfs symlinks shouldn't be abused like this and the whole thing doesn't belong in the block layer at all. Signed-off-by: Tejun Heo Reported-by: Milan Broz Cc: Jun'ichi Nomura Cc: Neil Brown Cc: linux-raid@vger.kernel.org Cc: Kay Sievers Signed-off-by: Jens Axboe --- drivers/md/dm-table.c | 1 + drivers/md/md.c | 1 + fs/block_dev.c | 97 ++++++++++++++++++++++++++++++++++--------- include/linux/fs.h | 8 +++- 4 files changed, 86 insertions(+), 21 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index dffa0ac7c4f0..38e4eb1bb965 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -350,6 +350,7 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) if (!d->dm_dev.bdev) return; + bd_unlink_disk_holder(d->dm_dev.bdev, dm_disk(md)); blkdev_put(d->dm_dev.bdev, d->dm_dev.mode | FMODE_EXCL); d->dm_dev.bdev = NULL; } diff --git a/drivers/md/md.c b/drivers/md/md.c index cf8594c5ea21..b76cfc89e1b5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1912,6 +1912,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) MD_BUG(); return; } + bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); 
printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); rdev->mddev = NULL; diff --git a/fs/block_dev.c b/fs/block_dev.c index fe3f59c14a02..333a7bb4cb9c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -432,6 +432,9 @@ static void init_once(void *foo) mutex_init(&bdev->bd_mutex); INIT_LIST_HEAD(&bdev->bd_inodes); INIT_LIST_HEAD(&bdev->bd_list); +#ifdef CONFIG_SYSFS + INIT_LIST_HEAD(&bdev->bd_holder_disks); +#endif inode_init_once(&ei->vfs_inode); /* Initialize mutex for freeze. */ mutex_init(&bdev->bd_fsfreeze_mutex); @@ -779,6 +782,23 @@ static struct block_device *bd_start_claiming(struct block_device *bdev, } #ifdef CONFIG_SYSFS +struct bd_holder_disk { + struct list_head list; + struct gendisk *disk; + int refcnt; +}; + +static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, + struct gendisk *disk) +{ + struct bd_holder_disk *holder; + + list_for_each_entry(holder, &bdev->bd_holder_disks, list) + if (holder->disk == disk) + return holder; + return NULL; +} + static int add_symlink(struct kobject *from, struct kobject *to) { return sysfs_create_link(from, to, kobject_name(to)); @@ -794,6 +814,8 @@ static void del_symlink(struct kobject *from, struct kobject *to) * @bdev: the claimed slave bdev * @disk: the holding disk * + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. + * * This functions creates the following sysfs symlinks. 
* * - from "slaves" directory of the holder @disk to the claimed @bdev @@ -817,47 +839,83 @@ static void del_symlink(struct kobject *from, struct kobject *to) */ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) { + struct bd_holder_disk *holder; int ret = 0; mutex_lock(&bdev->bd_mutex); - WARN_ON_ONCE(!bdev->bd_holder || bdev->bd_holder_disk); + WARN_ON_ONCE(!bdev->bd_holder); /* FIXME: remove the following once add_disk() handles errors */ if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir)) goto out_unlock; - ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); - if (ret) - goto out_unlock; - - ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); - if (ret) { - del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + holder = bd_find_holder_disk(bdev, disk); + if (holder) { + holder->refcnt++; goto out_unlock; } - bdev->bd_holder_disk = disk; + holder = kzalloc(sizeof(*holder), GFP_KERNEL); + if (!holder) { + ret = -ENOMEM; + goto out_unlock; + } + + INIT_LIST_HEAD(&holder->list); + holder->disk = disk; + holder->refcnt = 1; + + ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + if (ret) + goto out_free; + + ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); + if (ret) + goto out_del; + + list_add(&holder->list, &bdev->bd_holder_disks); + goto out_unlock; + +out_del: + del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); +out_free: + kfree(holder); out_unlock: mutex_unlock(&bdev->bd_mutex); return ret; } EXPORT_SYMBOL_GPL(bd_link_disk_holder); -static void bd_unlink_disk_holder(struct block_device *bdev) +/** + * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() + * @bdev: the claimed slave bdev + * @disk: the holding disk + * + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. + * + * CONTEXT: + * Might sleep. 
+ */ +void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { - struct gendisk *disk = bdev->bd_holder_disk; + struct bd_holder_disk *holder; - bdev->bd_holder_disk = NULL; - if (!disk) - return; + mutex_lock(&bdev->bd_mutex); - del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); - del_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); + holder = bd_find_holder_disk(bdev, disk); + + if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { + del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + del_symlink(bdev->bd_part->holder_dir, + &disk_to_dev(disk)->kobj); + list_del_init(&holder->list); + kfree(holder); + } + + mutex_unlock(&bdev->bd_mutex); } -#else -static inline void bd_unlink_disk_holder(struct block_device *bdev) -{ } +EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); #endif /** @@ -1380,7 +1438,6 @@ int blkdev_put(struct block_device *bdev, fmode_t mode) * unblock evpoll if it was a write holder. */ if (bdev_free) { - bd_unlink_disk_holder(bdev); if (bdev->bd_write_holder) { disk_unblock_events(bdev->bd_disk); bdev->bd_write_holder = false; diff --git a/include/linux/fs.h b/include/linux/fs.h index 3984f2358d1f..fb2190349cdf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -666,7 +666,7 @@ struct block_device { int bd_holders; bool bd_write_holder; #ifdef CONFIG_SYSFS - struct gendisk * bd_holder_disk; /* for sysfs slave linkng */ + struct list_head bd_holder_disks; #endif struct block_device * bd_contains; unsigned bd_block_size; @@ -2058,12 +2058,18 @@ extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, extern int blkdev_put(struct block_device *bdev, fmode_t mode); #ifdef CONFIG_SYSFS extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); +extern void bd_unlink_disk_holder(struct block_device *bdev, + struct gendisk *disk); #else static inline int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) { return 0; } +static inline 
void bd_unlink_disk_holder(struct block_device *bdev, + struct gendisk *disk) +{ +} #endif #endif