From 54514a70adefe356afe854e2d3912d46668068e6 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 23 Sep 2008 22:15:57 -0700 Subject: [PATCH 01/12] softirq: Add support for triggering softirq work on softirqs. This is basically a genericization of Jens Axboe's block layer remote softirq changes. Signed-off-by: David S. Miller Signed-off-by: Jens Axboe --- include/linux/interrupt.h | 21 +++++++ include/linux/smp.h | 4 +- kernel/softirq.c | 129 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 153 insertions(+), 1 deletion(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 54b3623434ec..35a61dc60d51 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include #include @@ -273,6 +275,25 @@ extern void softirq_init(void); extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); +/* This is the worklist that queues up per-cpu softirq work. + * + * send_remote_sendirq() adds work to these lists, and + * the softirq handler itself dequeues from them. The queues + * are protected by disabling local cpu interrupts and they must + * only be accessed by the local cpu that they are for. + */ +DECLARE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); + +/* Try to send a softirq to a remote cpu. If this cannot be done, the + * work will be queued to the local cpu. + */ +extern void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq); + +/* Like send_remote_softirq(), but the caller must disable local cpu interrupts + * and compute the current cpu, passed in as 'this_cpu'. + */ +extern void __send_remote_softirq(struct call_single_data *cp, int cpu, + int this_cpu, int softirq); /* Tasklets --- multithreaded analogue of BHs. diff --git a/include/linux/smp.h b/include/linux/smp.h index 66484d4a8459..2e4d58b26c06 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -7,6 +7,7 @@ */ #include +#include #include #include @@ -16,7 +17,8 @@ struct call_single_data { struct list_head list; void (*func) (void *info); void *info; - unsigned int flags; + u16 flags; + u16 priv; }; #ifdef CONFIG_SMP diff --git a/kernel/softirq.c b/kernel/softirq.c index 37d67aa2d56f..83ba21a13bd4 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -6,6 +6,8 @@ * Distribute under GPLv2. * * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) + * + * Remote softirq infrastructure is by Jens Axboe. */ #include @@ -474,17 +476,144 @@ void tasklet_kill(struct tasklet_struct *t) EXPORT_SYMBOL(tasklet_kill); +DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); +EXPORT_PER_CPU_SYMBOL(softirq_work_list); + +static void __local_trigger(struct call_single_data *cp, int softirq) +{ + struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); + + list_add_tail(&cp->list, head); + + /* Trigger the softirq only if the list was previously empty. */ + if (head->next == &cp->list) + raise_softirq_irqoff(softirq); +} + +#ifdef CONFIG_USE_GENERIC_SMP_HELPERS +static void remote_softirq_receive(void *data) +{ + struct call_single_data *cp = data; + unsigned long flags; + int softirq; + + softirq = cp->priv; + + local_irq_save(flags); + __local_trigger(cp, softirq); + local_irq_restore(flags); +} + +static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + if (cpu_online(cpu)) { + cp->func = remote_softirq_receive; + cp->info = cp; + cp->flags = 0; + cp->priv = softirq; + + __smp_call_function_single(cpu, cp); + return 0; + } + return 1; +} +#else /* CONFIG_USE_GENERIC_SMP_HELPERS */ +static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + return 1; +} +#endif + +/** + * __send_remote_softirq - try to schedule softirq work on a remote cpu + * @cp: private SMP call function data area + * @cpu: the remote cpu + * @this_cpu: the currently executing cpu + * @softirq: the softirq for the work + * + * Attempt to schedule softirq work on a remote cpu. If this cannot be + * done, the work is instead queued up on the local cpu. + * + * Interrupts must be disabled. + */ +void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) +{ + if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) + __local_trigger(cp, softirq); +} +EXPORT_SYMBOL(__send_remote_softirq); + +/** + * send_remote_softirq - try to schedule softirq work on a remote cpu + * @cp: private SMP call function data area + * @cpu: the remote cpu + * @softirq: the softirq for the work + * + * Like __send_remote_softirq except that disabling interrupts and + * computing the current cpu is done for the caller. + */ +void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + unsigned long flags; + int this_cpu; + + local_irq_save(flags); + this_cpu = smp_processor_id(); + __send_remote_softirq(cp, cpu, this_cpu, softirq); + local_irq_restore(flags); +} +EXPORT_SYMBOL(send_remote_softirq); + +static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + /* + * If a CPU goes away, splice its entries to the current CPU + * and trigger a run of the softirq + */ + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { + int cpu = (unsigned long) hcpu; + int i; + + local_irq_disable(); + for (i = 0; i < NR_SOFTIRQS; i++) { + struct list_head *head = &per_cpu(softirq_work_list[i], cpu); + struct list_head *local_head; + + if (list_empty(head)) + continue; + + local_head = &__get_cpu_var(softirq_work_list[i]); + list_splice_init(head, local_head); + raise_softirq_irqoff(i); + } + local_irq_enable(); + } + + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { + .notifier_call = remote_softirq_cpu_notify, +}; + void __init softirq_init(void) { int cpu; for_each_possible_cpu(cpu) { + int i; + per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; + for (i = 0; i < NR_SOFTIRQS; i++) + INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu)); } + register_hotcpu_notifier(&remote_softirq_cpu_notifier); + open_softirq(TASKLET_SOFTIRQ, tasklet_action); open_softirq(HI_SOFTIRQ, tasklet_hi_action); } From 0e11e342bac31f51ea055b147d20f77b59b9dd2c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 13 Oct 2008 10:46:01 +0200 Subject: [PATCH 02/12] block: add BIG FAT WARNING to CONFIG_DEBUG_BLOCK_EXT_DEVT CONFIG_DEBUG_BLOCK_EXT_DEVT can break booting even on some modern distros. Add BIG FAT WARNING to keep people at a safe distance. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- lib/Kconfig.debug | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 31d784dd80d0..b0f239e443bc 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -652,6 +652,11 @@ config DEBUG_BLOCK_EXT_DEVT depends on BLOCK default n help + BIG FAT WARNING: ENABLING THIS OPTION MIGHT BREAK BOOTING ON + SOME DISTRIBUTIONS. DO NOT ENABLE THIS UNLESS YOU KNOW WHAT + YOU ARE DOING. Distros, please enable this and fix whatever + is broken. + Conventionally, block device numbers are allocated from predetermined contiguous area. However, extended block area may introduce non-contiguous block device numbers. This From 0fc71e3d6520ba7abad5cfbc9a33db0190e4d5b8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 13 Oct 2008 13:27:59 +0200 Subject: [PATCH 03/12] block: add partition attribute for partition number With extended devt, finding out the partition number becomes a bit more challenging as subtracting the minor number from that of the parent device doesn't work anymore. The only thing left is parsing the partition name which is brittle and not exactly universal (some have '-' between the device name and partition number while others don't). This patch introduced partition attribute which contains the partition number of the device. This should make finding partitions and its index easier. This problem and solution were suggested by H. Peter Anvin. Signed-off-by: Tejun Heo Cc: H. Peter Anvin Signed-off-by: Jens Axboe --- fs/partitions/check.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/partitions/check.c b/fs/partitions/check.c index fbeb2f372a93..cfb0c80690aa 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -195,6 +195,14 @@ check_partition(struct gendisk *hd, struct block_device *bdev) return ERR_PTR(res); } +static ssize_t part_partition_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%d\n", p->partno); +} + static ssize_t part_start_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -260,6 +268,7 @@ ssize_t part_fail_store(struct device *dev, } #endif +static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); @@ -269,6 +278,7 @@ static struct device_attribute dev_attr_fail = #endif static struct attribute *part_attrs[] = { + &dev_attr_partition.attr, &dev_attr_start.attr, &dev_attr_size.attr, &dev_attr_stat.attr, From 8677142710516d986d932d6f1fba7be8382c1fec Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 13 Oct 2008 14:19:05 +0200 Subject: [PATCH 04/12] block: fix nr_phys_segments miscalculation bug This fixes the bug reported by Nikanth Karthikesan : http://lkml.org/lkml/2008/10/2/203 The root cause of the bug is that blk_phys_contig_segment miscalculates q->max_segment_size. blk_phys_contig_segment checks: req->biotail->bi_size + next_req->bio->bi_size > q->max_segment_size But blk_recalc_rq_segments might expect that req->biotail and the previous bio in the req are supposed be merged into one segment. blk_recalc_rq_segments might also expect that next_req->bio and the next bio in the next_req are supposed be merged into one segment. In such case, we merge two requests that can't be merged here. Later, blk_rq_map_sg gives more segments than it should. We need to keep track of segment size in blk_recalc_rq_segments and use it to see if two requests can be merged. This patch implements it in the similar way that we used to do for hw merging (virtual merging). Signed-off-by: FUJITA Tomonori Signed-off-by: Jens Axboe --- block/blk-merge.c | 20 ++++++++++++++++++-- include/linux/bio.h | 7 +++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 908d3e11ac52..8681cd6f9911 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -77,12 +77,20 @@ void blk_recalc_rq_segments(struct request *rq) continue; } new_segment: + if (nr_phys_segs == 1 && seg_size > rq->bio->bi_seg_front_size) + rq->bio->bi_seg_front_size = seg_size; + nr_phys_segs++; bvprv = bv; seg_size = bv->bv_len; highprv = high; } + if (nr_phys_segs == 1 && seg_size > rq->bio->bi_seg_front_size) + rq->bio->bi_seg_front_size = seg_size; + if (seg_size > rq->biotail->bi_seg_back_size) + rq->biotail->bi_seg_back_size = seg_size; + rq->nr_phys_segments = nr_phys_segs; } @@ -106,7 +114,8 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, if (!test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) return 0; - if (bio->bi_size + nxt->bi_size > q->max_segment_size) + if (bio->bi_seg_back_size + nxt->bi_seg_front_size > + q->max_segment_size) return 0; if (!bio_has_data(bio)) @@ -309,6 +318,8 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, struct request *next) { int total_phys_segments; + unsigned int seg_size = + req->biotail->bi_seg_back_size + next->bio->bi_seg_front_size; /* * First check if the either of the requests are re-queued @@ -324,8 +335,13 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, return 0; total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; - if (blk_phys_contig_segment(q, req->biotail, next->bio)) + if (blk_phys_contig_segment(q, req->biotail, next->bio)) { + if (req->nr_phys_segments == 1) + req->bio->bi_seg_front_size = seg_size; + if (next->nr_phys_segments == 1) + next->biotail->bi_seg_back_size = seg_size; total_phys_segments--; + } if (total_phys_segments > q->max_phys_segments) return 0; diff --git a/include/linux/bio.h b/include/linux/bio.h index ff5b4cf9e2da..dc3cec386a99 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -79,6 +79,13 @@ struct bio { unsigned int bi_size; /* residual I/O count */ + /* + * To keep track of the max segment size, we account for the + * sizes of the first and last mergeable segments in this bio. + */ + unsigned int bi_seg_front_size; + unsigned int bi_seg_back_size; + unsigned int bi_max_vecs; /* max bvl_vecs we can hold */ unsigned int bi_comp_cpu; /* completion CPU */ From e6d63840ba55ffd3a79aea6792aac6f29f338083 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 14 Oct 2008 08:49:34 +0200 Subject: [PATCH 05/12] block: fix kernel-doc for blk_alloc_devt() No argument 'gfp_mask' for blk_alloc_devt(). Signed-off-by: Li Zefan Signed-off-by: Jens Axboe --- block/genhd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 4cd3433c99ac..b8defae2ec03 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -358,7 +358,6 @@ static int blk_mangle_minor(int minor) /** * blk_alloc_devt - allocate a dev_t for a partition * @part: partition to allocate dev_t for - * @gfp_mask: memory allocation flag * @devt: out parameter for resulting dev_t * * Allocate a dev_t for block device. From ee2e992cc28553f6c4dd1ab5483c8733c393626b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 14 Oct 2008 08:49:56 +0200 Subject: [PATCH 06/12] block: simplify string handling in elv_iosched_store() strlcpy() guarantees the dest buffer is NULL teminated. Signed-off-by: Li Zefan Signed-off-by: Jens Axboe --- block/elevator.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/block/elevator.c b/block/elevator.c index 04518921db31..9482ffa1aae6 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -1166,15 +1166,10 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name, size_t count) { char elevator_name[ELV_NAME_MAX]; - size_t len; struct elevator_type *e; - elevator_name[sizeof(elevator_name) - 1] = '\0'; - strncpy(elevator_name, name, sizeof(elevator_name) - 1); - len = strlen(elevator_name); - - if (len && elevator_name[len - 1] == '\n') - elevator_name[len - 1] = '\0'; + strlcpy(elevator_name, name, sizeof(elevator_name)); + strstrip(elevator_name); e = elevator_get(elevator_name); if (!e) { From 80a4b58e36b63d7b0b592beb1bd6410aadeeb63c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 14 Oct 2008 09:51:06 +0200 Subject: [PATCH 07/12] block: only call ->request_fn when the queue is not stopped Callers should use either blk_run_queue/__blk_run_queue, or blk_start_queueing() to invoke request handling instead of calling ->request_fn() directly as that does not take the queue stopped flag into account. Also add appropriate comments on the above functions to detail their usage. Signed-off-by: Jens Axboe --- block/blk-core.c | 19 +++++++++++++++++-- block/elevator.c | 7 +++---- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 2d053b584410..91532f2d2fa7 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -325,6 +325,9 @@ EXPORT_SYMBOL(blk_unplug); static void blk_invoke_request_fn(struct request_queue *q) { + if (unlikely(blk_queue_stopped(q))) + return; + /* * one level of recursion is ok and is much faster than kicking * the unplug handling @@ -399,8 +402,13 @@ void blk_sync_queue(struct request_queue *q) EXPORT_SYMBOL(blk_sync_queue); /** - * blk_run_queue - run a single device queue + * __blk_run_queue - run a single device queue * @q: The queue to run + * + * Description: + * See @blk_run_queue. This variant must be called with the queue lock + * held and interrupts disabled. + * */ void __blk_run_queue(struct request_queue *q) { @@ -418,6 +426,12 @@ EXPORT_SYMBOL(__blk_run_queue); /** * blk_run_queue - run a single device queue * @q: The queue to run + * + * Description: + * Invoke request handling on this queue, if it has pending work to do. + * May be used to restart queueing when a request has completed. Also + * See @blk_start_queueing. + * */ void blk_run_queue(struct request_queue *q) { @@ -884,7 +898,8 @@ EXPORT_SYMBOL(blk_get_request); * * This is basically a helper to remove the need to know whether a queue * is plugged or not if someone just wants to initiate dispatch of requests - * for this queue. + * for this queue. Should be used to start queueing on a device outside + * of ->request_fn() context. Also see @blk_run_queue. * * The queue lock must be held with interrupts disabled. */ diff --git a/block/elevator.c b/block/elevator.c index 9482ffa1aae6..59173a69ebdf 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -612,7 +612,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) * processing. */ blk_remove_plug(q); - q->request_fn(q); + blk_start_queueing(q); break; case ELEVATOR_INSERT_SORT: @@ -950,7 +950,7 @@ void elv_completed_request(struct request_queue *q, struct request *rq) blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN && blk_ordered_req_seq(first_rq) > QUEUE_ORDSEQ_DRAIN) { blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0); - q->request_fn(q); + blk_start_queueing(q); } } } @@ -1109,8 +1109,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) elv_drain_elevator(q); while (q->rq.elvpriv) { - blk_remove_plug(q); - q->request_fn(q); + blk_start_queueing(q); spin_unlock_irq(q->queue_lock); msleep(10); spin_lock_irq(q->queue_lock); From 496aa8a98f5ab22ced46be5dc2087cdf3d029bd7 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 16 Oct 2008 07:46:23 +0200 Subject: [PATCH 08/12] block: fix current kernel-doc warnings Fix block kernel-doc warnings: Warning(linux-2.6.27-git4//fs/block_dev.c:1272): No description found for parameter 'path' Warning(linux-2.6.27-git4//block/blk-core.c:1021): No description found for parameter 'cpu' Warning(linux-2.6.27-git4//block/blk-core.c:1021): No description found for parameter 'part' Warning(/var/linsrc/linux-2.6.27-git4//block/genhd.c:544): No description found for parameter 'partno' Signed-off-by: Randy Dunlap Signed-off-by: Jens Axboe --- block/blk-core.c | 5 +++-- block/genhd.c | 2 +- fs/block_dev.c | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 91532f2d2fa7..8517264eb71e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1018,8 +1018,9 @@ static void part_round_stats_single(int cpu, struct hd_struct *part, } /** - * part_round_stats() - Round off the performance stats on a struct - * disk_stats. + * part_round_stats() - Round off the performance stats on a struct disk_stats. + * @cpu: cpu number for stats access + * @part: target partition * * The average IO queue length and utilisation statistics are maintained * by observing the current state of the queue length and the amount of diff --git a/block/genhd.c b/block/genhd.c index b8defae2ec03..646e1d2507c7 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -534,7 +534,7 @@ void unlink_gendisk(struct gendisk *disk) /** * get_gendisk - get partitioning information for a given device * @devt: device to get partitioning information for - * @part: returned partition index + * @partno: returned partition index * * This function gets the structure containing partitioning * information for the given device @devt. diff --git a/fs/block_dev.c b/fs/block_dev.c index d84f0469a016..218408eed1bb 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1262,7 +1262,7 @@ EXPORT_SYMBOL(ioctl_by_bdev); /** * lookup_bdev - lookup a struct block_device by name - * @pathname: special file representing the block device + * @path: special file representing the block device * * Get a reference to the blockdevice at @pathname in the current * namespace if possible and return it. Return ERR_PTR(error) From 756f8243188e013bd067811cdc0cc60760abfdf9 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Thu, 16 Oct 2008 08:23:21 +0200 Subject: [PATCH 09/12] blktrace: add support for driver data This patch adds the new api call blk_add_driver_data() to blktrace. It allows to trace device driver-specific binary data. Signed-off-by: Stefan Raspl Signed-off-by: Martin Peschke Signed-off-by: Jens Axboe --- include/linux/blktrace_api.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 3a31eb506164..bdf505d33e77 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -24,6 +24,7 @@ enum blktrace_cat { BLK_TC_AHEAD = 1 << 11, /* readahead */ BLK_TC_META = 1 << 12, /* metadata */ BLK_TC_DISCARD = 1 << 13, /* discard requests */ + BLK_TC_DRV_DATA = 1 << 14, /* binary per-driver data */ BLK_TC_END = 1 << 15, /* only 16-bits, reminder */ }; @@ -51,6 +52,7 @@ enum blktrace_act { __BLK_TA_BOUNCE, /* bio was bounced */ __BLK_TA_REMAP, /* bio was remapped */ __BLK_TA_ABORT, /* request aborted */ + __BLK_TA_DRV_DATA, /* driver-specific binary data */ }; /* @@ -82,6 +84,7 @@ enum blktrace_notify { #define BLK_TA_BOUNCE (__BLK_TA_BOUNCE) #define BLK_TA_REMAP (__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE)) #define BLK_TA_ABORT (__BLK_TA_ABORT | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_DRV_DATA (__BLK_TA_DRV_DATA | BLK_TC_ACT(BLK_TC_DRV_DATA)) #define BLK_TN_PROCESS (__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY)) #define BLK_TN_TIMESTAMP (__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY)) @@ -317,6 +320,34 @@ static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio, __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); } +/** + * blk_add_driver_data - Add binary message with driver-specific data + * @q: queue the io is for + * @rq: io request + * @data: driver-specific data + * @len: length of driver-specific data + * + * Description: + * Some drivers might want to write driver-specific data per request. + * + **/ +static inline void blk_add_driver_data(struct request_queue *q, + struct request *rq, + void *data, size_t len) +{ + struct blk_trace *bt = q->blk_trace; + + if (likely(!bt)) + return; + + if (blk_pc_request(rq)) + __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, + rq->errors, len, data); + else + __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, + 0, BLK_TA_DRV_DATA, rq->errors, len, data); +} + extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, char __user *arg); extern int blk_trace_startstop(struct request_queue *q, int start); @@ -330,6 +361,7 @@ extern int blk_trace_remove(struct request_queue *q); #define blk_add_trace_generic(q, rq, rw, what) do { } while (0) #define blk_add_trace_pdu_int(q, what, bio, pdu) do { } while (0) #define blk_add_trace_remap(q, bio, dev, f, t) do {} while (0) +#define blk_add_driver_data(q, rq, data, len) do {} while (0) #define do_blk_trace_setup(q, name, dev, buts) (-ENOTTY) #define blk_trace_setup(q, name, dev, arg) (-ENOTTY) #define blk_trace_startstop(q, start) (-ENOTTY) From 0997f1c5fec0b540784611036d458a84a1f7029a Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Thu, 16 Oct 2008 08:23:39 +0200 Subject: [PATCH 10/12] blktrace: pass zfcp driver data This patch writes the channel and fabric latencies in nanoseconds per request via blktrace for later analysis. The utilization of the inbound and outbound adapter queue is also reported. Signed-off-by: Stefan Raspl Signed-off-by: Martin Peschke Signed-off-by: Christof Schmitt Signed-off-by: Jens Axboe --- drivers/s390/scsi/zfcp_def.h | 2 ++ drivers/s390/scsi/zfcp_fsf.c | 34 ++++++++++++++++++++++++++++++++++ drivers/s390/scsi/zfcp_fsf.h | 12 ++++++++++++ drivers/s390/scsi/zfcp_qdio.c | 1 + 4 files changed, 49 insertions(+) diff --git a/drivers/s390/scsi/zfcp_def.h b/drivers/s390/scsi/zfcp_def.h index 8a13071c444c..9ce4c75bd190 100644 --- a/drivers/s390/scsi/zfcp_def.h +++ b/drivers/s390/scsi/zfcp_def.h @@ -583,6 +583,8 @@ struct zfcp_fsf_req { unsigned long long issued; /* request sent time (STCK) */ struct zfcp_unit *unit; void (*handler)(struct zfcp_fsf_req *); + u16 qdio_outb_usage;/* usage of outbound queue */ + u16 qdio_inb_usage; /* usage of inbound queue */ }; /* driver data */ diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c index 739356a5c123..5ae1d497e5ed 100644 --- a/drivers/s390/scsi/zfcp_fsf.c +++ b/drivers/s390/scsi/zfcp_fsf.c @@ -6,6 +6,7 @@ * Copyright IBM Corporation 2002, 2008 */ +#include #include "zfcp_ext.h" static void zfcp_fsf_request_timeout_handler(unsigned long data) @@ -777,6 +778,7 @@ static int zfcp_fsf_req_send(struct zfcp_fsf_req *req) list_add_tail(&req->list, &adapter->req_list[idx]); spin_unlock(&adapter->req_list_lock); + req->qdio_outb_usage = atomic_read(&req_q->count); req->issued = get_clock(); if (zfcp_qdio_send(req)) { /* Queues are down..... */ @@ -2082,6 +2084,36 @@ static void zfcp_fsf_req_latency(struct zfcp_fsf_req *req) spin_unlock_irqrestore(&unit->latencies.lock, flags); } +#ifdef CONFIG_BLK_DEV_IO_TRACE +static void zfcp_fsf_trace_latency(struct zfcp_fsf_req *fsf_req) +{ + struct fsf_qual_latency_info *lat_inf; + struct scsi_cmnd *scsi_cmnd = (struct scsi_cmnd *)fsf_req->data; + struct request *req = scsi_cmnd->request; + struct zfcp_blk_drv_data trace; + int ticks = fsf_req->adapter->timer_ticks; + + trace.flags = 0; + trace.magic = ZFCP_BLK_DRV_DATA_MAGIC; + if (fsf_req->adapter->adapter_features & FSF_FEATURE_MEASUREMENT_DATA) { + trace.flags |= ZFCP_BLK_LAT_VALID; + lat_inf = &fsf_req->qtcb->prefix.prot_status_qual.latency_info; + trace.channel_lat = lat_inf->channel_lat * ticks; + trace.fabric_lat = lat_inf->fabric_lat * ticks; + } + if (fsf_req->status & ZFCP_STATUS_FSFREQ_ERROR) + trace.flags |= ZFCP_BLK_REQ_ERROR; + trace.inb_usage = fsf_req->qdio_inb_usage; + trace.outb_usage = fsf_req->qdio_outb_usage; + + blk_add_driver_data(req->q, req, &trace, sizeof(trace)); +} +#else +static inline void zfcp_fsf_trace_latency(struct zfcp_fsf_req *fsf_req) +{ +} +#endif + static void zfcp_fsf_send_fcp_command_task_handler(struct zfcp_fsf_req *req) { struct scsi_cmnd *scpnt = req->data; @@ -2114,6 +2146,8 @@ static void zfcp_fsf_send_fcp_command_task_handler(struct zfcp_fsf_req *req) if (req->adapter->adapter_features & FSF_FEATURE_MEASUREMENT_DATA) zfcp_fsf_req_latency(req); + zfcp_fsf_trace_latency(req); + if (unlikely(fcp_rsp_iu->validity.bits.fcp_rsp_len_valid)) { if (fcp_rsp_info[3] == RSP_CODE_GOOD) set_host_byte(scpnt, DID_OK); diff --git a/drivers/s390/scsi/zfcp_fsf.h b/drivers/s390/scsi/zfcp_fsf.h index fd3a88777ac8..fa2a31780611 100644 --- a/drivers/s390/scsi/zfcp_fsf.h +++ b/drivers/s390/scsi/zfcp_fsf.h @@ -439,4 +439,16 @@ struct fsf_qtcb { u8 log[FSF_QTCB_LOG_SIZE]; } __attribute__ ((packed)); +struct zfcp_blk_drv_data { +#define ZFCP_BLK_DRV_DATA_MAGIC 0x1 + u32 magic; +#define ZFCP_BLK_LAT_VALID 0x1 +#define ZFCP_BLK_REQ_ERROR 0x2 + u16 flags; + u8 inb_usage; + u8 outb_usage; + u64 channel_lat; + u64 fabric_lat; +} __attribute__ ((packed)); + #endif /* FSF_H */ diff --git a/drivers/s390/scsi/zfcp_qdio.c b/drivers/s390/scsi/zfcp_qdio.c index 3e05080e62d4..664752f90b20 100644 --- a/drivers/s390/scsi/zfcp_qdio.c +++ b/drivers/s390/scsi/zfcp_qdio.c @@ -115,6 +115,7 @@ static void zfcp_qdio_reqid_check(struct zfcp_adapter *adapter, spin_unlock_irqrestore(&adapter->req_list_lock, flags); fsf_req->sbal_response = sbal_idx; + fsf_req->qdio_inb_usage = atomic_read(&adapter->resp_q.count); zfcp_fsf_req_complete(fsf_req); } From 713ada9ba94f2ad874cffd074b83e3dc681ca82f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Oct 2008 13:44:57 +0200 Subject: [PATCH 11/12] block: move q->unplug_work initialization modprobe loop; rmmod loop effectively creates a blk_queue and destroys it which results in q->unplug_work being canceled without it ever being initialized. Therefore, move the initialization of q->unplug_work from blk_queue_make_request() to blk_alloc_queue*(). Reported-by: Alexey Dobriyan Signed-off-by: Peter Zijlstra Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + block/blk-settings.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 8517264eb71e..fcbd56dd41fa 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -515,6 +515,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) init_timer(&q->unplug_timer); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); INIT_LIST_HEAD(&q->timeout_list); + INIT_WORK(&q->unplug_work, blk_unplug_work); kobject_init(&q->kobj, &blk_queue_ktype); diff --git a/block/blk-settings.c b/block/blk-settings.c index b21dcdb64151..41392fbe19ff 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -141,8 +141,6 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) if (q->unplug_delay == 0) q->unplug_delay = 1; - INIT_WORK(&q->unplug_work, blk_unplug_work); - q->unplug_timer.function = blk_unplug_timeout; q->unplug_timer.data = (unsigned long)q; From f73e2d13a16cc88c4faa4729967f92bfeec8a142 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Oct 2008 14:03:08 +0200 Subject: [PATCH 12/12] block: remove __generic_unplug_device() from exports The only out-of-core user is IDE, and that should be using blk_start_queueing() instead. Signed-off-by: Jens Axboe --- block/blk-core.c | 1 - block/blk.h | 1 + drivers/ide/ide-io.c | 4 ++-- include/linux/blkdev.h | 1 - 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index fcbd56dd41fa..81a496004976 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -257,7 +257,6 @@ void __generic_unplug_device(struct request_queue *q) q->request_fn(q); } -EXPORT_SYMBOL(__generic_unplug_device); /** * generic_unplug_device - fire a request queue diff --git a/block/blk.h b/block/blk.h index e5c579769963..d2e49af90db5 100644 --- a/block/blk.h +++ b/block/blk.h @@ -20,6 +20,7 @@ void blk_unplug_timeout(unsigned long data); void blk_rq_timed_out_timer(unsigned long data); void blk_delete_timer(struct request *); void blk_add_timer(struct request *); +void __generic_unplug_device(struct request_queue *); /* * Internal atomic flags for request handling diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 77c6eaeacefa..7162d67562af 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -1493,8 +1493,8 @@ void ide_do_drive_cmd(ide_drive_t *drive, struct request *rq) spin_lock_irqsave(&ide_lock, flags); hwgroup->rq = NULL; - __elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 1); - __generic_unplug_device(drive->queue); + __elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 0); + blk_start_queueing(drive->queue); spin_unlock_irqrestore(&ide_lock, flags); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a92d9e4ea96e..8eed8b15f992 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -856,7 +856,6 @@ extern void blk_ordered_complete_seq(struct request_queue *, unsigned, int); extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); extern void blk_dump_rq_flags(struct request *, char *); extern void generic_unplug_device(struct request_queue *); -extern void __generic_unplug_device(struct request_queue *); extern long nr_blockdev_pages(void); int blk_get_queue(struct request_queue *);