mirror of https://github.com/torvalds/linux.git (synced 2024-12-25 20:32:22 +00:00)
block: remove legacy IO schedulers
Retain the deadline documentation, as that carries over to mq-deadline as well.

Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
parent 404b8f5a03
commit f382fb0bce
@@ -1,291 +0,0 @@
CFQ (Complete Fairness Queueing)
===============================

The main aim of CFQ scheduler is to provide a fair allocation of the disk
I/O bandwidth for all the processes which requests an I/O operation.

CFQ maintains the per process queue for the processes which request I/O
operation(synchronous requests). In case of asynchronous requests, all the
requests from all the processes are batched together according to their
process's I/O priority.

CFQ ioscheduler tunables
========================

slice_idle
----------
This specifies how long CFQ should idle for next request on certain cfq queues
(for sequential workloads) and service trees (for random workloads) before
queue is expired and CFQ selects next queue to dispatch from.

By default slice_idle is a non-zero value. That means by default we idle on
queues/service trees. This can be very helpful on highly seeky media like
single spindle SATA/SAS disks where we can cut down on overall number of
seeks and see improved throughput.

Setting slice_idle to 0 will remove all the idling on queues/service tree
level and one should see an overall improved throughput on faster storage
devices like multiple SATA/SAS disks in hardware RAID configuration. The down
side is that isolation provided from WRITES also goes down and notion of
IO priority becomes weaker.

So depending on storage and workload, it might be useful to set slice_idle=0.
In general I think for SATA/SAS disks and software RAID of SATA/SAS disks
keeping slice_idle enabled should be useful. For any configurations where
there are multiple spindles behind single LUN (Host based hardware RAID
controller or for storage arrays), setting slice_idle=0 might end up in better
throughput and acceptable latencies.

back_seek_max
-------------
This specifies, given in Kbytes, the maximum "distance" for backward seeking.
The distance is the amount of space from the current head location to the
sectors that are backward in terms of distance.

This parameter allows the scheduler to anticipate requests in the "backward"
direction and consider them as being the "next" if they are within this
distance from the current head location.

back_seek_penalty
-----------------
This parameter is used to compute the cost of backward seeking. If the
backward distance of request is just 1/back_seek_penalty from a "front"
request, then the seeking cost of two requests is considered equivalent.

So scheduler will not bias toward one or the other request (otherwise scheduler
will bias toward front request). Default value of back_seek_penalty is 2.
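
For illustration only (an editor's sketch, not part of the removed cfq-iosched.c): the back_seek_penalty rule above amounts to weighting the backward distance by the penalty factor before comparing it with the forward candidate. The names below are made up for the example.

    #include <stdbool.h>

    /*
     * Sketch: with back_seek_penalty = 2 (the default), a backward request
     * only beats a forward one when it is less than half as far away, i.e.
     * its weighted cost back_dist * penalty is smaller than front_dist.
     */
    static bool prefer_backward(unsigned long back_dist,
                                unsigned long front_dist,
                                unsigned int back_seek_penalty)
    {
            return back_dist * back_seek_penalty < front_dist;
    }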
fifo_expire_async
-----------------
This parameter is used to set the timeout of asynchronous requests. Default
value of this is 248ms.

fifo_expire_sync
----------------
This parameter is used to set the timeout of synchronous requests. Default
value of this is 124ms. In case to favor synchronous requests over asynchronous
one, this value should be decreased relative to fifo_expire_async.

group_idle
-----------
This parameter forces idling at the CFQ group level instead of CFQ
queue level. This was introduced after a bottleneck was observed
in higher end storage due to idle on sequential queue and allow dispatch
from a single queue. The idea with this parameter is that it can be run with
slice_idle=0 and group_idle=8, so that idling does not happen on individual
queues in the group but happens overall on the group and thus still keeps the
IO controller working.
Not idling on individual queues in the group will dispatch requests from
multiple queues in the group at the same time and achieve higher throughput
on higher end storage.

Default value for this parameter is 8ms.

low_latency
-----------
This parameter is used to enable/disable the low latency mode of the CFQ
scheduler. If enabled, CFQ tries to recompute the slice time for each process
based on the target_latency set for the system. This favors fairness over
throughput. Disabling low latency (setting it to 0) ignores target latency,
allowing each process in the system to get a full time slice.

By default low latency mode is enabled.

target_latency
--------------
This parameter is used to calculate the time slice for a process if cfq's
latency mode is enabled. It will ensure that sync requests have an estimated
latency. But if sequential workload is higher(e.g. sequential read),
then to meet the latency constraints, throughput may decrease because of less
time for each process to issue I/O request before the cfq queue is switched.

Though this can be overcome by disabling the latency_mode, it may increase
the read latency for some applications. This parameter allows for changing
target_latency through the sysfs interface which can provide the balanced
throughput and read latency.

Default value for target_latency is 300ms.

slice_async
-----------
This parameter is same as of slice_sync but for asynchronous queue. The
default value is 40ms.

slice_async_rq
--------------
This parameter is used to limit the dispatching of asynchronous request to
device request queue in queue's slice time. The maximum number of request that
are allowed to be dispatched also depends upon the io priority. Default value
for this is 2.

slice_sync
----------
When a queue is selected for execution, the queues IO requests are only
executed for a certain amount of time(time_slice) before switching to another
queue. This parameter is used to calculate the time slice of synchronous
queue.

time_slice is computed using the below equation:-
time_slice = slice_sync + (slice_sync/5 * (4 - prio)). To increase the
time_slice of synchronous queue, increase the value of slice_sync. Default
value is 100ms.
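
A quick check of that formula (an editor's illustration, not part of the removed file): with slice_sync = 100ms the slice ranges from 180ms at the highest priority (prio 0) down to 40ms at the lowest (prio 7), passing through 100ms at prio 4.

    #include <stdio.h>

    /* time_slice = slice_sync + (slice_sync / 5) * (4 - prio), per the text above */
    static int cfq_time_slice(int slice_sync_ms, int prio)
    {
            return slice_sync_ms + (slice_sync_ms / 5) * (4 - prio);
    }

    int main(void)
    {
            for (int prio = 0; prio <= 7; prio++)
                    printf("prio %d -> %d ms\n", prio, cfq_time_slice(100, prio));
            return 0;   /* prio 0 -> 180 ms, prio 4 -> 100 ms, prio 7 -> 40 ms */
    }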
quantum
-------
This specifies the number of request dispatched to the device queue. In a
queue's time slice, a request will not be dispatched if the number of request
in the device exceeds this parameter. This parameter is used for synchronous
request.

In case of storage with several disk, this setting can limit the parallel
processing of request. Therefore, increasing the value can improve the
performance although this can cause the latency of some I/O to increase due
to more number of requests.

CFQ Group scheduling
====================

CFQ supports blkio cgroup and has "blkio." prefixed files in each
blkio cgroup directory. It is weight-based and there are four knobs
for configuration - weight[_device] and leaf_weight[_device].
Internal cgroup nodes (the ones with children) can also have tasks in
them, so the former two configure how much proportion the cgroup as a
whole is entitled to at its parent's level while the latter two
configure how much proportion the tasks in the cgroup have compared to
its direct children.

Another way to think about it is assuming that each internal node has
an implicit leaf child node which hosts all the tasks whose weight is
configured by leaf_weight[_device]. Let's assume a blkio hierarchy
composed of five cgroups - root, A, B, AA and AB - with the following
weights where the names represent the hierarchy.

        weight  leaf_weight
 root :  125     125
 A    :  500     750
 B    :  250     500
 AA   :  500     500
 AB   : 1000     500

root never has a parent making its weight is meaningless. For backward
compatibility, weight is always kept in sync with leaf_weight. B, AA
and AB have no child and thus its tasks have no children cgroup to
compete with. They always get 100% of what the cgroup won at the
parent level. Considering only the weights which matter, the hierarchy
looks like the following.

          root
       /   |   \
      A    B   leaf
     500  250   125
   /  |  \
  AA  AB  leaf
 500 1000  750

If all cgroups have active IOs and competing with each other, disk
time will be distributed like the following.

Distribution below root. The total active weight at this level is
A:500 + B:250 + C:125 = 875.

 root-leaf :   125 /  875      =~ 14%
 A         :   500 /  875      =~ 57%
 B(-leaf)  :   250 /  875      =~ 28%

A has children and further distributes its 57% among the children and
the implicit leaf node. The total active weight at this level is
AA:500 + AB:1000 + A-leaf:750 = 2250.

 A-leaf    : ( 750 / 2250) * A =~ 19%
 AA(-leaf) : ( 500 / 2250) * A =~ 12%
 AB(-leaf) : (1000 / 2250) * A =~ 25%
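
The share arithmetic above is easy to reproduce. A small standalone program (an editor's illustration, not kernel code) that recomputes the percentages for this example hierarchy:

    #include <stdio.h>

    int main(void)
    {
            /* Weights competing below root: A, B, and root's implicit leaf. */
            double A = 500, B = 250, root_leaf = 125;
            double top = A + B + root_leaf;                     /* 875 */
            double A_share = A / top;                           /* ~57% of disk time */

            printf("root-leaf %.1f%%  A %.1f%%  B %.1f%%\n",
                   100 * root_leaf / top, 100 * A_share, 100 * B / top);

            /* A's share is split again between AA, AB and A's implicit leaf. */
            double AA = 500, AB = 1000, A_leaf = 750;
            double inner = AA + AB + A_leaf;                    /* 2250 */

            printf("A-leaf %.1f%%  AA %.1f%%  AB %.1f%%\n",
                   100 * A_share * A_leaf / inner,
                   100 * A_share * AA / inner,
                   100 * A_share * AB / inner);
            return 0;
    }

The output is roughly 14.3 / 57.1 / 28.6 at the top level and 19.0 / 12.7 / 25.4 below A, close to the rounded figures in the text.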
CFQ IOPS Mode for group scheduling
===================================
Basic CFQ design is to provide priority based time slices. Higher priority
process gets bigger time slice and lower priority process gets smaller time
slice. Measuring time becomes harder if storage is fast and supports NCQ and
it would be better to dispatch multiple requests from multiple cfq queues in
request queue at a time. In such scenario, it is not possible to measure time
consumed by single queue accurately.

What is possible though is to measure number of requests dispatched from a
single queue and also allow dispatch from multiple cfq queue at the same time.
This effectively becomes the fairness in terms of IOPS (IO operations per
second).

If one sets slice_idle=0 and if storage supports NCQ, CFQ internally switches
to IOPS mode and starts providing fairness in terms of number of requests
dispatched. Note that this mode switching takes effect only for group
scheduling. For non-cgroup users nothing should change.

CFQ IO scheduler Idling Theory
===============================
Idling on a queue is primarily about waiting for the next request to come
on same queue after completion of a request. In this process CFQ will not
dispatch requests from other cfq queues even if requests are pending there.

The rationale behind idling is that it can cut down on number of seeks
on rotational media. For example, if a process is doing dependent
sequential reads (next read will come on only after completion of previous
one), then not dispatching request from other queue should help as we
did not move the disk head and kept on dispatching sequential IO from
one queue.

CFQ has following service trees and various queues are put on these trees.

        sync-idle       sync-noidle     async

All cfq queues doing synchronous sequential IO go on to sync-idle tree.
On this tree we idle on each queue individually.

All synchronous non-sequential queues go on sync-noidle tree. Also any
synchronous write request which is not marked with REQ_IDLE goes on this
service tree. On this tree we do not idle on individual queues instead idle
on the whole group of queues or the tree. So if there are 4 queues waiting
for IO to dispatch we will idle only once last queue has dispatched the IO
and there is no more IO on this service tree.

All async writes go on async service tree. There is no idling on async
queues.

CFQ has some optimizations for SSDs and if it detects a non-rotational
media which can support higher queue depth (multiple requests at in
flight at a time), then it cuts down on idling of individual queues and
all the queues move to sync-noidle tree and only tree idle remains. This
tree idling provides isolation with buffered write queues on async tree.
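
A rough sketch of the classification the last few paragraphs describe (an editor's illustration with hypothetical predicates, not the removed cfq-iosched.c logic):

    enum cfq_service_tree { SYNC_IDLE, SYNC_NOIDLE, ASYNC };

    /*
     * Illustrative only: is_sync, is_sequential, is_write and has_req_idle
     * stand in for whatever state the scheduler tracks per queue/request.
     */
    static enum cfq_service_tree pick_service_tree(int is_sync, int is_sequential,
                                                   int is_write, int has_req_idle)
    {
            if (!is_sync)
                    return ASYNC;           /* async writes: never idled on */
            if (!is_sequential)
                    return SYNC_NOIDLE;     /* random sync IO: idle on the tree only */
            if (is_write && !has_req_idle)
                    return SYNC_NOIDLE;     /* sync writes without REQ_IDLE */
            return SYNC_IDLE;               /* sequential sync IO: idle per queue */
    }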
FAQ
===
Q1. Why to idle at all on queues not marked with REQ_IDLE.

A1. We only do tree idle (all queues on sync-noidle tree) on queues not marked
    with REQ_IDLE. This helps in providing isolation with all the sync-idle
    queues. Otherwise in presence of many sequential readers, other
    synchronous IO might not get fair share of disk.

    For example, if there are 10 sequential readers doing IO and they get
    100ms each. If a !REQ_IDLE request comes in, it will be scheduled
    roughly after 1 second. If after completion of !REQ_IDLE request we
    do not idle, and after a couple of milli seconds a another !REQ_IDLE
    request comes in, again it will be scheduled after 1second. Repeat it
    and notice how a workload can lose its disk share and suffer due to
    multiple sequential readers.

    fsync can generate dependent IO where bunch of data is written in the
    context of fsync, and later some journaling data is written. Journaling
    data comes in only after fsync has finished its IO (atleast for ext4
    that seemed to be the case). Now if one decides not to idle on fsync
    thread due to !REQ_IDLE, then next journaling write will not get
    scheduled for another second. A process doing small fsync, will suffer
    badly in presence of multiple sequential readers.

    Hence doing tree idling on threads using !REQ_IDLE flag on requests
    provides isolation from multiple sequential readers and at the same
    time we do not idle on individual threads.

Q2. When to specify REQ_IDLE
A2. I would think whenever one is doing synchronous write and expecting
    more writes to be dispatched from same context soon, should be able
    to specify REQ_IDLE on writes and that probably should work well for
    most of the cases.
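
As a concrete illustration of A2 (a hedged sketch, not code from this commit): REQ_IDLE is an op flag a submitter can OR into a bio's bi_opf when it expects more writes from the same context shortly, for example:

    #include <linux/bio.h>
    #include <linux/blk_types.h>

    /*
     * Sketch: mark a synchronous write with REQ_IDLE so the scheduler may keep
     * idling on this context, expecting further writes to follow soon.
     */
    static void submit_idle_write(struct bio *bio)
    {
            bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
            submit_bio(bio);
    }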
@@ -3,67 +3,6 @@ if BLOCK

menu "IO Schedulers"

config IOSCHED_NOOP
        bool
        default y
        ---help---
          The no-op I/O scheduler is a minimal scheduler that does basic merging
          and sorting. Its main uses include non-disk based block devices like
          memory devices, and specialised software or hardware environments
          that do their own scheduling and require only minimal assistance from
          the kernel.

config IOSCHED_DEADLINE
        tristate "Deadline I/O scheduler"
        default y
        ---help---
          The deadline I/O scheduler is simple and compact. It will provide
          CSCAN service with FIFO expiration of requests, switching to
          a new point in the service tree and doing a batch of IO from there
          in case of expiry.

config IOSCHED_CFQ
        tristate "CFQ I/O scheduler"
        default y
        ---help---
          The CFQ I/O scheduler tries to distribute bandwidth equally
          among all processes in the system. It should provide a fair
          and low latency working environment, suitable for both desktop
          and server systems.

          This is the default I/O scheduler.

config CFQ_GROUP_IOSCHED
        bool "CFQ Group Scheduling support"
        depends on IOSCHED_CFQ && BLK_CGROUP
        ---help---
          Enable group IO scheduling in CFQ.

choice

        prompt "Default I/O scheduler"
        default DEFAULT_CFQ
        help
          Select the I/O scheduler which will be used by default for all
          block devices.

        config DEFAULT_DEADLINE
                bool "Deadline" if IOSCHED_DEADLINE=y

        config DEFAULT_CFQ
                bool "CFQ" if IOSCHED_CFQ=y

        config DEFAULT_NOOP
                bool "No-op"

endchoice

config DEFAULT_IOSCHED
        string
        default "deadline" if DEFAULT_DEADLINE
        default "cfq" if DEFAULT_CFQ
        default "noop" if DEFAULT_NOOP

config MQ_IOSCHED_DEADLINE
        tristate "MQ deadline I/O scheduler"
        default y
@@ -18,9 +18,6 @@ obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o
obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
block/cfq-iosched.c (4916 changed lines)
File diff suppressed because it is too large
@@ -1,560 +0,0 @@
/*
 *  Deadline i/o scheduler.
 *
 *  Copyright (C) 2002 Jens Axboe <axboe@kernel.dk>
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/rbtree.h>

/*
 * See Documentation/block/deadline-iosched.txt
 */
static const int read_expire = HZ / 2;  /* max time before a read is submitted. */
static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
static const int writes_starved = 2;    /* max times reads can starve a write */
static const int fifo_batch = 16;       /* # of sequential requests treated as one
                                           by the above parameters. For throughput. */

struct deadline_data {
        /*
         * run time data
         */

        /*
         * requests (deadline_rq s) are present on both sort_list and fifo_list
         */
        struct rb_root sort_list[2];
        struct list_head fifo_list[2];

        /*
         * next in sort order. read, write or both are NULL
         */
        struct request *next_rq[2];
        unsigned int batching;          /* number of sequential requests made */
        unsigned int starved;           /* times reads have starved writes */

        /*
         * settings that change how the i/o scheduler behaves
         */
        int fifo_expire[2];
        int fifo_batch;
        int writes_starved;
        int front_merges;
};

static inline struct rb_root *
deadline_rb_root(struct deadline_data *dd, struct request *rq)
{
        return &dd->sort_list[rq_data_dir(rq)];
}

/*
 * get the request after `rq' in sector-sorted order
 */
static inline struct request *
deadline_latter_request(struct request *rq)
{
        struct rb_node *node = rb_next(&rq->rb_node);

        if (node)
                return rb_entry_rq(node);

        return NULL;
}

static void
deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
{
        struct rb_root *root = deadline_rb_root(dd, rq);

        elv_rb_add(root, rq);
}

static inline void
deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
{
        const int data_dir = rq_data_dir(rq);

        if (dd->next_rq[data_dir] == rq)
                dd->next_rq[data_dir] = deadline_latter_request(rq);

        elv_rb_del(deadline_rb_root(dd, rq), rq);
}

/*
 * add rq to rbtree and fifo
 */
static void
deadline_add_request(struct request_queue *q, struct request *rq)
{
        struct deadline_data *dd = q->elevator->elevator_data;
        const int data_dir = rq_data_dir(rq);

        /*
         * This may be a requeue of a write request that has locked its
         * target zone. If it is the case, this releases the zone lock.
         */
        blk_req_zone_write_unlock(rq);

        deadline_add_rq_rb(dd, rq);

        /*
         * set expire time and add to fifo list
         */
        rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
        list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
}

/*
 * remove rq from rbtree and fifo.
 */
static void deadline_remove_request(struct request_queue *q, struct request *rq)
{
        struct deadline_data *dd = q->elevator->elevator_data;

        rq_fifo_clear(rq);
        deadline_del_rq_rb(dd, rq);
}

static enum elv_merge
deadline_merge(struct request_queue *q, struct request **req, struct bio *bio)
{
        struct deadline_data *dd = q->elevator->elevator_data;
        struct request *__rq;

        /*
         * check for front merge
         */
        if (dd->front_merges) {
                sector_t sector = bio_end_sector(bio);

                __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
                if (__rq) {
                        BUG_ON(sector != blk_rq_pos(__rq));

                        if (elv_bio_merge_ok(__rq, bio)) {
                                *req = __rq;
                                return ELEVATOR_FRONT_MERGE;
                        }
                }
        }

        return ELEVATOR_NO_MERGE;
}

static void deadline_merged_request(struct request_queue *q,
                                    struct request *req, enum elv_merge type)
{
        struct deadline_data *dd = q->elevator->elevator_data;

        /*
         * if the merge was a front merge, we need to reposition request
         */
        if (type == ELEVATOR_FRONT_MERGE) {
                elv_rb_del(deadline_rb_root(dd, req), req);
                deadline_add_rq_rb(dd, req);
        }
}

static void
deadline_merged_requests(struct request_queue *q, struct request *req,
                         struct request *next)
{
        /*
         * if next expires before rq, assign its expire time to rq
         * and move into next position (next will be deleted) in fifo
         */
        if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
                if (time_before((unsigned long)next->fifo_time,
                                (unsigned long)req->fifo_time)) {
                        list_move(&req->queuelist, &next->queuelist);
                        req->fifo_time = next->fifo_time;
                }
        }

        /*
         * kill knowledge of next, this one is a goner
         */
        deadline_remove_request(q, next);
}

/*
 * move request from sort list to dispatch queue.
 */
static inline void
deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
{
        struct request_queue *q = rq->q;

        /*
         * For a zoned block device, write requests must write lock their
         * target zone.
         */
        blk_req_zone_write_lock(rq);

        deadline_remove_request(q, rq);
        elv_dispatch_add_tail(q, rq);
}

/*
 * move an entry to dispatch queue
 */
static void
deadline_move_request(struct deadline_data *dd, struct request *rq)
{
        const int data_dir = rq_data_dir(rq);

        dd->next_rq[READ] = NULL;
        dd->next_rq[WRITE] = NULL;
        dd->next_rq[data_dir] = deadline_latter_request(rq);

        /*
         * take it off the sort and fifo list, move
         * to dispatch queue
         */
        deadline_move_to_dispatch(dd, rq);
}

/*
 * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
 * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
 */
static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
{
        struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);

        /*
         * rq is expired!
         */
        if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
                return 1;

        return 0;
}

/*
 * For the specified data direction, return the next request to dispatch using
 * arrival ordered lists.
 */
static struct request *
deadline_fifo_request(struct deadline_data *dd, int data_dir)
{
        struct request *rq;

        if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
                return NULL;

        if (list_empty(&dd->fifo_list[data_dir]))
                return NULL;

        rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
        if (data_dir == READ || !blk_queue_is_zoned(rq->q))
                return rq;

        /*
         * Look for a write request that can be dispatched, that is one with
         * an unlocked target zone.
         */
        list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
                if (blk_req_can_dispatch_to_zone(rq))
                        return rq;
        }

        return NULL;
}

/*
 * For the specified data direction, return the next request to dispatch using
 * sector position sorted lists.
 */
static struct request *
deadline_next_request(struct deadline_data *dd, int data_dir)
{
        struct request *rq;

        if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
                return NULL;

        rq = dd->next_rq[data_dir];
        if (!rq)
                return NULL;

        if (data_dir == READ || !blk_queue_is_zoned(rq->q))
                return rq;

        /*
         * Look for a write request that can be dispatched, that is one with
         * an unlocked target zone.
         */
        while (rq) {
                if (blk_req_can_dispatch_to_zone(rq))
                        return rq;
                rq = deadline_latter_request(rq);
        }

        return NULL;
}

/*
 * deadline_dispatch_requests selects the best request according to
 * read/write expire, fifo_batch, etc
 */
static int deadline_dispatch_requests(struct request_queue *q, int force)
{
        struct deadline_data *dd = q->elevator->elevator_data;
        const int reads = !list_empty(&dd->fifo_list[READ]);
        const int writes = !list_empty(&dd->fifo_list[WRITE]);
        struct request *rq, *next_rq;
        int data_dir;

        /*
         * batches are currently reads XOR writes
         */
        rq = deadline_next_request(dd, WRITE);
        if (!rq)
                rq = deadline_next_request(dd, READ);

        if (rq && dd->batching < dd->fifo_batch)
                /* we have a next request are still entitled to batch */
                goto dispatch_request;

        /*
         * at this point we are not running a batch. select the appropriate
         * data direction (read / write)
         */

        if (reads) {
                BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));

                if (deadline_fifo_request(dd, WRITE) &&
                    (dd->starved++ >= dd->writes_starved))
                        goto dispatch_writes;

                data_dir = READ;

                goto dispatch_find_request;
        }

        /*
         * there are either no reads or writes have been starved
         */

        if (writes) {
dispatch_writes:
                BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));

                dd->starved = 0;

                data_dir = WRITE;

                goto dispatch_find_request;
        }

        return 0;

dispatch_find_request:
        /*
         * we are not running a batch, find best request for selected data_dir
         */
        next_rq = deadline_next_request(dd, data_dir);
        if (deadline_check_fifo(dd, data_dir) || !next_rq) {
                /*
                 * A deadline has expired, the last request was in the other
                 * direction, or we have run out of higher-sectored requests.
                 * Start again from the request with the earliest expiry time.
                 */
                rq = deadline_fifo_request(dd, data_dir);
        } else {
                /*
                 * The last req was the same dir and we have a next request in
                 * sort order. No expired requests so continue on from here.
                 */
                rq = next_rq;
        }

        /*
         * For a zoned block device, if we only have writes queued and none of
         * them can be dispatched, rq will be NULL.
         */
        if (!rq)
                return 0;

        dd->batching = 0;

dispatch_request:
        /*
         * rq is the selected appropriate request.
         */
        dd->batching++;
        deadline_move_request(dd, rq);

        return 1;
}

/*
 * For zoned block devices, write unlock the target zone of completed
 * write requests.
 */
static void
deadline_completed_request(struct request_queue *q, struct request *rq)
{
        blk_req_zone_write_unlock(rq);
}

static void deadline_exit_queue(struct elevator_queue *e)
{
        struct deadline_data *dd = e->elevator_data;

        BUG_ON(!list_empty(&dd->fifo_list[READ]));
        BUG_ON(!list_empty(&dd->fifo_list[WRITE]));

        kfree(dd);
}

/*
 * initialize elevator private data (deadline_data).
 */
static int deadline_init_queue(struct request_queue *q, struct elevator_type *e)
{
        struct deadline_data *dd;
        struct elevator_queue *eq;

        eq = elevator_alloc(q, e);
        if (!eq)
                return -ENOMEM;

        dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
        if (!dd) {
                kobject_put(&eq->kobj);
                return -ENOMEM;
        }
        eq->elevator_data = dd;

        INIT_LIST_HEAD(&dd->fifo_list[READ]);
        INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
        dd->sort_list[READ] = RB_ROOT;
        dd->sort_list[WRITE] = RB_ROOT;
        dd->fifo_expire[READ] = read_expire;
        dd->fifo_expire[WRITE] = write_expire;
        dd->writes_starved = writes_starved;
        dd->front_merges = 1;
        dd->fifo_batch = fifo_batch;

        spin_lock_irq(q->queue_lock);
        q->elevator = eq;
        spin_unlock_irq(q->queue_lock);
        return 0;
}

/*
 * sysfs parts below
 */

static ssize_t
deadline_var_show(int var, char *page)
{
        return sprintf(page, "%d\n", var);
}

static void
deadline_var_store(int *var, const char *page)
{
        char *p = (char *) page;

        *var = simple_strtol(p, &p, 10);
}

#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
static ssize_t __FUNC(struct elevator_queue *e, char *page) \
{ \
        struct deadline_data *dd = e->elevator_data; \
        int __data = __VAR; \
        if (__CONV) \
                __data = jiffies_to_msecs(__data); \
        return deadline_var_show(__data, (page)); \
}
SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
#undef SHOW_FUNCTION

#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
{ \
        struct deadline_data *dd = e->elevator_data; \
        int __data; \
        deadline_var_store(&__data, (page)); \
        if (__data < (MIN)) \
                __data = (MIN); \
        else if (__data > (MAX)) \
                __data = (MAX); \
        if (__CONV) \
                *(__PTR) = msecs_to_jiffies(__data); \
        else \
                *(__PTR) = __data; \
        return count; \
}
STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
#undef STORE_FUNCTION

#define DD_ATTR(name) \
        __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)

static struct elv_fs_entry deadline_attrs[] = {
        DD_ATTR(read_expire),
        DD_ATTR(write_expire),
        DD_ATTR(writes_starved),
        DD_ATTR(front_merges),
        DD_ATTR(fifo_batch),
        __ATTR_NULL
};

static struct elevator_type iosched_deadline = {
        .ops.sq = {
                .elevator_merge_fn = deadline_merge,
                .elevator_merged_fn = deadline_merged_request,
                .elevator_merge_req_fn = deadline_merged_requests,
                .elevator_dispatch_fn = deadline_dispatch_requests,
                .elevator_completed_req_fn = deadline_completed_request,
                .elevator_add_req_fn = deadline_add_request,
                .elevator_former_req_fn = elv_rb_former_request,
                .elevator_latter_req_fn = elv_rb_latter_request,
                .elevator_init_fn = deadline_init_queue,
                .elevator_exit_fn = deadline_exit_queue,
        },

        .elevator_attrs = deadline_attrs,
        .elevator_name = "deadline",
        .elevator_owner = THIS_MODULE,
};

static int __init deadline_init(void)
{
        return elv_register(&iosched_deadline);
}

static void __exit deadline_exit(void)
{
        elv_unregister(&iosched_deadline);
}

module_init(deadline_init);
module_exit(deadline_exit);

MODULE_AUTHOR("Jens Axboe");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("deadline IO scheduler");
@@ -225,8 +225,6 @@ int elevator_init(struct request_queue *q)
                                                chosen_elevator);
        }

        if (!e)
                e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false);
        if (!e) {
                printk(KERN_ERR
                        "Default I/O scheduler not found. Using noop.\n");

@@ -356,68 +354,6 @@ struct request *elv_rb_find(struct rb_root *root, sector_t sector)
}
EXPORT_SYMBOL(elv_rb_find);

/*
 * Insert rq into dispatch queue of q. Queue lock must be held on
 * entry. rq is sort instead into the dispatch queue. To be used by
 * specific elevators.
 */
void elv_dispatch_sort(struct request_queue *q, struct request *rq)
{
        sector_t boundary;
        struct list_head *entry;

        if (q->last_merge == rq)
                q->last_merge = NULL;

        elv_rqhash_del(q, rq);

        q->nr_sorted--;

        boundary = q->end_sector;
        list_for_each_prev(entry, &q->queue_head) {
                struct request *pos = list_entry_rq(entry);

                if (req_op(rq) != req_op(pos))
                        break;
                if (rq_data_dir(rq) != rq_data_dir(pos))
                        break;
                if (pos->rq_flags & (RQF_STARTED | RQF_SOFTBARRIER))
                        break;
                if (blk_rq_pos(rq) >= boundary) {
                        if (blk_rq_pos(pos) < boundary)
                                continue;
                } else {
                        if (blk_rq_pos(pos) >= boundary)
                                break;
                }
                if (blk_rq_pos(rq) >= blk_rq_pos(pos))
                        break;
        }

        list_add(&rq->queuelist, entry);
}
EXPORT_SYMBOL(elv_dispatch_sort);

/*
 * Insert rq into dispatch queue of q. Queue lock must be held on
 * entry. rq is added to the back of the dispatch queue. To be used by
 * specific elevators.
 */
void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
{
        if (q->last_merge == rq)
                q->last_merge = NULL;

        elv_rqhash_del(q, rq);

        q->nr_sorted--;

        q->end_sector = rq_end_sector(rq);
        q->boundary_rq = rq;
        list_add_tail(&rq->queuelist, &q->queue_head);
}
EXPORT_SYMBOL(elv_dispatch_add_tail);

enum elv_merge elv_merge(struct request_queue *q, struct request **req,
                struct bio *bio)
{

@@ -881,12 +817,6 @@ int elv_register(struct elevator_type *e)
        list_add_tail(&e->list, &elv_list);
        spin_unlock(&elv_list_lock);

        /* print pretty message */
        if (elevator_match(e, chosen_elevator) ||
                        (!*chosen_elevator &&
                         elevator_match(e, CONFIG_DEFAULT_IOSCHED)))
                                def = " (default)";

        printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name,
                                                                def);
        return 0;
@@ -1,124 +0,0 @@
/*
 * elevator noop
 */
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>

struct noop_data {
        struct list_head queue;
};

static void noop_merged_requests(struct request_queue *q, struct request *rq,
                                 struct request *next)
{
        list_del_init(&next->queuelist);
}

static int noop_dispatch(struct request_queue *q, int force)
{
        struct noop_data *nd = q->elevator->elevator_data;
        struct request *rq;

        rq = list_first_entry_or_null(&nd->queue, struct request, queuelist);
        if (rq) {
                list_del_init(&rq->queuelist);
                elv_dispatch_sort(q, rq);
                return 1;
        }
        return 0;
}

static void noop_add_request(struct request_queue *q, struct request *rq)
{
        struct noop_data *nd = q->elevator->elevator_data;

        list_add_tail(&rq->queuelist, &nd->queue);
}

static struct request *
noop_former_request(struct request_queue *q, struct request *rq)
{
        struct noop_data *nd = q->elevator->elevator_data;

        if (rq->queuelist.prev == &nd->queue)
                return NULL;
        return list_prev_entry(rq, queuelist);
}

static struct request *
noop_latter_request(struct request_queue *q, struct request *rq)
{
        struct noop_data *nd = q->elevator->elevator_data;

        if (rq->queuelist.next == &nd->queue)
                return NULL;
        return list_next_entry(rq, queuelist);
}

static int noop_init_queue(struct request_queue *q, struct elevator_type *e)
{
        struct noop_data *nd;
        struct elevator_queue *eq;

        eq = elevator_alloc(q, e);
        if (!eq)
                return -ENOMEM;

        nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
        if (!nd) {
                kobject_put(&eq->kobj);
                return -ENOMEM;
        }
        eq->elevator_data = nd;

        INIT_LIST_HEAD(&nd->queue);

        spin_lock_irq(q->queue_lock);
        q->elevator = eq;
        spin_unlock_irq(q->queue_lock);
        return 0;
}

static void noop_exit_queue(struct elevator_queue *e)
{
        struct noop_data *nd = e->elevator_data;

        BUG_ON(!list_empty(&nd->queue));
        kfree(nd);
}

static struct elevator_type elevator_noop = {
        .ops.sq = {
                .elevator_merge_req_fn = noop_merged_requests,
                .elevator_dispatch_fn = noop_dispatch,
                .elevator_add_req_fn = noop_add_request,
                .elevator_former_req_fn = noop_former_request,
                .elevator_latter_req_fn = noop_latter_request,
                .elevator_init_fn = noop_init_queue,
                .elevator_exit_fn = noop_exit_queue,
        },
        .elevator_name = "noop",
        .elevator_owner = THIS_MODULE,
};

static int __init noop_init(void)
{
        return elv_register(&elevator_noop);
}

static void __exit noop_exit(void)
{
        elv_unregister(&elevator_noop);
}

module_init(noop_init);
module_exit(noop_exit);


MODULE_AUTHOR("Jens Axboe");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("No-op IO scheduler");