This patch performs sequential mapping between CPUs and queues. In case
the system has more CPUs than HWQs, there are still CPUs left to map to
HWQs. In a hyperthreaded system, map those unmapped CPUs and their
siblings to the same HWQ.

This actually fixes a bug that left HWQs unmapped in a system with
2 sockets, 18 cores per socket and 2 threads per core (72 CPUs total)
running NVMEoF (which opens up to a maximum of 64 HWQs).

Performance results running fio (72 jobs, 128 iodepth) using null_blk,
with patch / without patch:

       IOPS read             IOPS write            IOPS read             IOPS write
bs     (submit_queues=72)    (submit_queues=72)    (submit_queues=24)    (submit_queues=24)
-----  --------------------  --------------------  --------------------  --------------------
512    4890.4K/4723.5K       4524.7K/4324.2K       4280.2K/4264.3K       3902.4K/3909.5K
1k     4910.1K/4715.2K       4535.8K/4309.6K       4296.7K/4269.1K       3906.8K/3914.9K
2k     4906.3K/4739.7K       4526.7K/4330.6K       4301.1K/4262.4K       3890.8K/3900.1K
4k     4918.6K/4730.7K       4556.1K/4343.6K       4297.6K/4264.5K       3886.9K/3893.9K
8k     4906.4K/4748.9K       4550.9K/4346.7K       4283.2K/4268.8K       3863.4K/3858.2K
16k    4903.8K/4782.6K       4501.5K/4233.9K       4292.3K/4282.3K       3773.1K/3773.5K
32k    4885.8K/4782.4K       4365.9K/4184.2K       4307.5K/4289.4K       3780.3K/3687.3K
64k    4822.5K/4762.7K       2752.8K/2675.1K       4308.8K/4312.3K       2651.5K/2655.7K
128k   2388.5K/2313.8K       1391.9K/1375.7K       2142.8K/2152.2K       1395.5K/1374.2K

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
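To make the mapping policy concrete, here is a small standalone userspace sketch
that re-implements it outside the kernel. The fixed topology is an assumption
chosen to mirror the example above: 72 CPUs, 64 HWQs, and CPU i sharing a core
with CPU i + 36. blk_mq_map_queues() itself reads the real sibling masks at
runtime, and offline CPUs (which the kernel maps to queue 0) are ignored here.

/*
 * Standalone illustration of the sequential + sibling mapping policy.
 * NOT kernel code: the topology is hard-coded (an assumption), whereas
 * blk_mq_map_queues() reads the real sibling masks at runtime.
 */
#include <stdio.h>

#define NR_CPUS		72	/* 2 sockets x 18 cores x 2 threads */
#define NR_QUEUES	64	/* e.g. the NVMEoF HWQ limit mentioned above */
#define NR_CORES	(NR_CPUS / 2)

/* Assumed enumeration: core c exposes threads c and c + NR_CORES. */
static int first_sibling(int cpu)
{
	return cpu % NR_CORES;
}

int main(void)
{
	int map[NR_CPUS];
	int cpu, sib;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (cpu < NR_QUEUES) {
			/* Sequential mapping while unclaimed queues remain. */
			map[cpu] = cpu % NR_QUEUES;
		} else {
			/*
			 * Leftover CPUs share the queue of their first
			 * sibling; if a CPU is its own first sibling,
			 * fall back to wrapping, as the kernel code does.
			 */
			sib = first_sibling(cpu);
			map[cpu] = (sib == cpu) ? cpu % NR_QUEUES : map[sib];
		}
	}

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		printf("CPU %2d -> HWQ %2d\n", cpu, map[cpu]);

	return 0;
}

Under these assumptions every one of the 64 HWQs ends up with at least one CPU:
queues 0-27 and 36-63 each get a single CPU, while queues 28-35 each serve a
full hyperthread pair, which is the "no unmapped HWQs" property that motivated
the patch.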
/*
 * CPU <-> hardware queue mapping helpers
 *
 * Copyright (C) 2013-2014 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/threads.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/cpu.h>

#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"

static int cpu_to_queue_index(unsigned int nr_queues, const int cpu,
			      const struct cpumask *online_mask)
{
	/*
	 * Non-online CPUs will be mapped to queue index 0.
	 */
	if (!cpumask_test_cpu(cpu, online_mask))
		return 0;
	return cpu % nr_queues;
}

static int get_first_sibling(unsigned int cpu)
{
	unsigned int ret;

	ret = cpumask_first(topology_sibling_cpumask(cpu));
	if (ret < nr_cpu_ids)
		return ret;

	return cpu;
}

int blk_mq_map_queues(struct blk_mq_tag_set *set)
{
	unsigned int *map = set->mq_map;
	unsigned int nr_queues = set->nr_hw_queues;
	const struct cpumask *online_mask = cpu_online_mask;
	unsigned int cpu, first_sibling;

	for_each_possible_cpu(cpu) {
		/*
		 * First do sequential mapping between CPUs and queues.
		 * In case we still have CPUs to map, and we have some
		 * number of threads per core, then map sibling threads
		 * to the same queue for performance optimizations.
		 */
		if (cpu < nr_queues) {
			map[cpu] = cpu_to_queue_index(nr_queues, cpu, online_mask);
		} else {
			first_sibling = get_first_sibling(cpu);
			if (first_sibling == cpu)
				map[cpu] = cpu_to_queue_index(nr_queues, cpu, online_mask);
			else
				map[cpu] = map[first_sibling];
		}
	}

	return 0;
}
EXPORT_SYMBOL_GPL(blk_mq_map_queues);

/*
 * We have no quick way of doing reverse lookups. This is only used at
 * queue init time, so runtime isn't important.
 */
int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index)
{
	int i;

	for_each_possible_cpu(i) {
		if (index == mq_map[i])
			return local_memory_node(cpu_to_node(i));
	}

	return NUMA_NO_NODE;
}
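For context, drivers reach blk_mq_map_queues() through the map_queues callback
in their blk_mq_ops, or, at this point in the tree, by leaving that callback
NULL, in which case blk-mq falls back to blk_mq_map_queues() while building the
tag set. The fragment below is a hypothetical sketch of such a setup, not code
from this patch: the example_* names and the queue count/depth values are made
up, and the queue_rq prototype assumes the blk_status_t return type used in
kernels of roughly this vintage (older trees returned int).

/*
 * Hypothetical driver-side sketch (not part of this patch): a tag set
 * whose ->map_queues callback reuses blk_mq_map_queues().  Leaving the
 * callback NULL has the same effect in this kernel era, since blk-mq
 * falls back to blk_mq_map_queues() when no driver mapping is given.
 */
#include <linux/module.h>
#include <linux/blk-mq.h>

static blk_status_t example_queue_rq(struct blk_mq_hw_ctx *hctx,
				     const struct blk_mq_queue_data *bd)
{
	/* A real driver would process and complete bd->rq here. */
	return BLK_STS_OK;
}

static const struct blk_mq_ops example_mq_ops = {
	.queue_rq	= example_queue_rq,
	.map_queues	= blk_mq_map_queues,
};

static struct blk_mq_tag_set example_tag_set;

static int example_setup_tag_set(void)
{
	example_tag_set.ops		= &example_mq_ops;
	example_tag_set.nr_hw_queues	= 64;	/* illustrative; cf. the NVMEoF limit above */
	example_tag_set.queue_depth	= 128;	/* illustrative */
	example_tag_set.numa_node	= NUMA_NO_NODE;
	example_tag_set.cmd_size	= 0;
	example_tag_set.flags		= BLK_MQ_F_SHOULD_MERGE;

	/* Allocating the tag set is what ends up running the mapping above. */
	return blk_mq_alloc_tag_set(&example_tag_set);
}

blk_mq_alloc_tag_set() allocates set->mq_map and then invokes the mapping, so a
driver with no special topology requirements gets the sibling-aware layout from
this file without any extra work.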