virtio-fs: add multi-queue support

This commit creates a multi-queue mapping at device bring-up. The driver first attempts to use the existing MSI-X interrupt affinities (previously disabled), and if not present, will distribute the request queues evenly over the CPUs. If the latter fails as well, all CPUs are mapped to request queue zero. When a request is handed from FUSE to the virtio-fs device driver, the driver will use the current CPU to index into the multi-queue mapping and determine the optimal request queue to use. We measured the performance of this patch with the fio benchmarking tool, increasing the number of queues results in a significant speedup for both read and write operations, demonstrating the effectiveness of multi-queue support. Host: - Dell PowerEdge R760 - CPU: Intel(R) Xeon(R) Gold 6438M, 128 cores - VM: KVM with 32 cores Virtio-fs device: - BlueField-3 DPU - CPU: ARM Cortex-A78AE, 16 cores - One thread per queue, each busy polling on one request queue - Each queue is 1024 descriptors deep Workload: - fio, sequential read or write, ioengine=libaio, numjobs=32, 4GiB file per job, iodepth=8, bs=256KiB, runtime=30s Performance Results: +===========================+==========+===========+ | Number of queues | Fio read | Fio write | +===========================+==========+===========+ | 1 request queue (GiB/s) | 6.1 | 4.6 | +---------------------------+----------+-----------+ | 8 request queues (GiB/s) | 25.8 | 10.3 | +---------------------------+----------+-----------+ | 16 request queues (GiB/s) | 30.9 | 19.5 | +---------------------------+----------+-----------+ | 32 request queue (GiB/s) | 33.2 | 22.6 | +---------------------------+----------+-----------+ | Speedup | 5.5x | 5x | +---------------=-----------+----------+-----------+ Signed-off-by: Peter-Jan Gootzen <pgootzen@nvidia.com> Signed-off-by: Yoray Zack <yorayz@nvidia.com> Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> Acked-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2024-11-10 22:21:40 +00:00 · 2024-05-01 17:38:17 +02:00 · 2024-05-01 17:38:17 +02:00 · 529395d2ae
commit 529395d2ae
parent 103c2de111
1 changed files with 62 additions and 8 deletions
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@ -7,6 +7,8 @@
 #include <linux/fs.h>
 #include <linux/dax.h>
 #include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/group_cpus.h>
 #include <linux/pfn_t.h>
 #include <linux/memremap.h>
 #include <linux/module.h>
@ -67,6 +69,8 @@ struct virtio_fs {
 	unsigned int num_request_queues; /* number of request queues */
 	struct dax_device *dax_dev;

+	unsigned int *mq_map; /* index = cpu id, value = request vq id */
+
 	/* DAX memory window where file contents are mapped */
 	void *window_kaddr;
 	phys_addr_t window_phys_addr;
@ -185,6 +189,7 @@ static void virtio_fs_ktype_release(struct kobject *kobj)
 {
 	struct virtio_fs *vfs = container_of(kobj, struct virtio_fs, kobj);

+	kfree(vfs->mq_map);
 	kfree(vfs->vqs);
 	kfree(vfs);
 }
@ -706,6 +711,44 @@ static void virtio_fs_requests_done_work(struct work_struct *work)
 	}
 }

+static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *fs)
+{
+	const struct cpumask *mask, *masks;
+	unsigned int q, cpu;
+
+	/* First attempt to map using existing transport layer affinities
+	 * e.g. PCIe MSI-X
+	 */
+	if (!vdev->config->get_vq_affinity)
+		goto fallback;
+
+	for (q = 0; q < fs->num_request_queues; q++) {
+		mask = vdev->config->get_vq_affinity(vdev, VQ_REQUEST + q);
+		if (!mask)
+			goto fallback;
+
+		for_each_cpu(cpu, mask)
+			fs->mq_map[cpu] = q;
+	}
+
+	return;
+fallback:
+	/* Attempt to map evenly in groups over the CPUs */
+	masks = group_cpus_evenly(fs->num_request_queues);
+	/* If even this fails we default to all CPUs use queue zero */
+	if (!masks) {
+		for_each_possible_cpu(cpu)
+			fs->mq_map[cpu] = 0;
+		return;
+	}
+
+	for (q = 0; q < fs->num_request_queues; q++) {
+		for_each_cpu(cpu, &masks[q])
+			fs->mq_map[cpu] = q;
+	}
+	kfree(masks);
+}
+
 /* Virtqueue interrupt handler */
 static void virtio_fs_vq_done(struct virtqueue *vq)
 {
@ -742,6 +785,11 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
 {
 	struct virtqueue **vqs;
 	vq_callback_t **callbacks;
+	/* Specify pre_vectors to ensure that the queues before the
+	 * request queues (e.g. hiprio) don't claim any of the CPUs in
+	 * the multi-queue mapping and interrupt affinities
+	 */
+	struct irq_affinity desc = { .pre_vectors = VQ_REQUEST };
 	const char **names;
 	unsigned int i;
 	int ret = 0;
@ -763,7 +811,9 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
 	callbacks = kmalloc_array(fs->nvqs, sizeof(callbacks[VQ_HIPRIO]),
 					GFP_KERNEL);
 	names = kmalloc_array(fs->nvqs, sizeof(names[VQ_HIPRIO]), GFP_KERNEL);
-	if (!vqs || !callbacks || !names) {
+	fs->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*fs->mq_map), GFP_KERNEL,
+					dev_to_node(&vdev->dev));
+	if (!vqs || !callbacks || !names || !fs->mq_map) {
 		ret = -ENOMEM;
 		goto out;
 	}
@ -783,7 +833,7 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
 		names[i] = fs->vqs[i].name;
 	}

-	ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, NULL);
+	ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, &desc);
 	if (ret < 0)
 		goto out;

@ -795,8 +845,10 @@ out:
 	kfree(names);
 	kfree(callbacks);
 	kfree(vqs);
-	if (ret)
+	if (ret) {
 		kfree(fs->vqs);
+		kfree(fs->mq_map);
+	}
 	return ret;
 }

@ -942,7 +994,7 @@ static int virtio_fs_probe(struct virtio_device *vdev)
 	if (ret < 0)
 		goto out;

-	/* TODO vq affinity */
+	virtio_fs_map_queues(vdev, fs);

 	ret = virtio_fs_setup_dax(vdev, fs);
 	if (ret < 0)
@ -1291,7 +1343,7 @@ out:
 static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
 __releases(fiq->lock)
 {
-	unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */
+	unsigned int queue_id;
 	struct virtio_fs *fs;
 	struct fuse_req *req;
 	struct virtio_fs_vq *fsvq;
@ -1305,11 +1357,13 @@ __releases(fiq->lock)
 	spin_unlock(&fiq->lock);

 	fs = fiq->priv;
+	queue_id = VQ_REQUEST + fs->mq_map[raw_smp_processor_id()];

-	pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n",
-		  __func__, req->in.h.opcode, req->in.h.unique,
+	pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u queue_id %u\n",
+		 __func__, req->in.h.opcode, req->in.h.unique,
 		 req->in.h.nodeid, req->in.h.len,
-		 fuse_len_args(req->args->out_numargs, req->args->out_args));
+		 fuse_len_args(req->args->out_numargs, req->args->out_args),
+		 queue_id);

 	fsvq = &fs->vqs[queue_id];
 	ret = virtio_fs_enqueue_req(fsvq, req, false);