net/mlx5: Allocating a pool of MSI-X vectors for SFs

SFs (Sub Functions) currently use IRQs from the global IRQ table of their
parent Physical Function. In order to scale better, we need to allocate
more IRQs and share them between different SFs.

The driver will maintain three separate IRQ pools:
1. A pool that serves the PF consumers (the PF's netdev and RDMA stacks),
similar to what the driver had before this patch. I.e., this pool will
share IRQs between RDMA and netdev, and will keep the IRQ indexes and
allocation order. The latter is important for the PF netdev rmap (aRFS).

2. A pool of control IRQs for SFs. The size of this pool is the number
of SFs that can be created divided by SFS_PER_IRQ. This pool will serve
the control path EQs of the SFs.

3. A pool of completion data path IRQs for SF transport queues. The
size of this pool is:
num_irqs_allocated - pf_pool_size - sf_ctrl_pool_size.
This pool will serve the netdev and RDMA stacks. Moreover, rmap is not
supported on SFs. A worked sizing example is given below, after this
list.
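
For illustration only, here is a minimal, self-contained sketch of the pool
sizing arithmetic described above. The numbers (total_vec, pf_vec, max_sfs)
are hypothetical examples, and sfs_per_ctrl_irq, ctrl_irq_max and
comp_eqs_per_sf simply mirror the MLX5_SFS_PER_CTRL_IRQ, MLX5_IRQ_CTRL_SF_MAX
and MLX5_COMP_EQS_PER_SF constants introduced by this patch; the sketch is
not part of the patch itself.

  #include <stdio.h>

  #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
  #define MIN(a, b) ((a) < (b) ? (a) : (b))

  int main(void)
  {
          /* Hypothetical example values, not taken from any real device. */
          int total_vec = 128;       /* MSI-X vectors actually allocated */
          int pf_vec = 24;           /* vectors reserved for the PF pool */
          int max_sfs = 200;         /* max SFs the device can create */
          int sfs_per_ctrl_irq = 64; /* mirrors MLX5_SFS_PER_CTRL_IRQ */
          int ctrl_irq_max = 8;      /* mirrors MLX5_IRQ_CTRL_SF_MAX */
          int comp_eqs_per_sf = 8;   /* mirrors MLX5_COMP_EQS_PER_SF */

          int sf_vec = total_vec - pf_vec;

          /* sf_ctrl pool: bounded both by the available SF vectors and by
           * how many SFs one control IRQ serves, capped at ctrl_irq_max.
           */
          int by_msix = DIV_ROUND_UP(sf_vec, comp_eqs_per_sf);
          int by_sfs = DIV_ROUND_UP(max_sfs, sfs_per_ctrl_irq);
          int sf_ctrl = MIN(ctrl_irq_max, MIN(by_msix, by_sfs));

          /* sf_comp pool: the SF vectors left after the control pool. */
          int sf_comp = sf_vec - sf_ctrl;

          printf("pf pool: %d, sf_ctrl pool: %d, sf_comp pool: %d\n",
                 pf_vec, sf_ctrl, sf_comp);
          return 0;
  }

With these example numbers the SF control pool gets min(8, min(13, 4)) = 4
IRQs and the SF completion pool gets the remaining 100, matching
num_irqs_allocated - pf_pool_size - sf_ctrl_pool_size above.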

The sharing methodology of the SF pools is explained in the next patch.

Important note: rmap is not supported on SFs because rmap mapping cannot
function correctly for IRQs that are shared between different core/netdev
RX rings.

Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Shay Drory 2021-02-23 11:48:17 +02:00 committed by Saeed Mahameed
parent fc63dd2a85
commit 71e084e264
3 changed files with 240 additions and 132 deletions


@@ -471,14 +471,7 @@ static int create_async_eq(struct mlx5_core_dev *dev,
int err;
mutex_lock(&eq_table->lock);
/* Async EQs must share irq index 0 */
if (param->irq_index != 0) {
err = -EINVAL;
goto unlock;
}
err = create_map_eq(dev, eq, param);
unlock:
mutex_unlock(&eq_table->lock);
return err;
}
@@ -996,8 +989,11 @@ int mlx5_eq_table_create(struct mlx5_core_dev *dev)
eq_table->num_comp_eqs =
min_t(int,
mlx5_irq_get_num_comp(eq_table->irq_table),
mlx5_irq_table_get_num_comp(eq_table->irq_table),
num_eqs - MLX5_MAX_ASYNC_EQS);
if (mlx5_core_is_sf(dev))
eq_table->num_comp_eqs = min_t(int, eq_table->num_comp_eqs,
MLX5_COMP_EQS_PER_SF);
err = create_async_eqs(dev);
if (err) {


@@ -6,13 +6,17 @@
#include <linux/mlx5/driver.h>
#define MLX5_COMP_EQS_PER_SF 8
#define MLX5_IRQ_EQ_CTRL (0)
struct mlx5_irq;
int mlx5_irq_table_init(struct mlx5_core_dev *dev);
void mlx5_irq_table_cleanup(struct mlx5_core_dev *dev);
int mlx5_irq_table_create(struct mlx5_core_dev *dev);
void mlx5_irq_table_destroy(struct mlx5_core_dev *dev);
int mlx5_irq_get_num_comp(struct mlx5_irq_table *table);
int mlx5_irq_table_get_num_comp(struct mlx5_irq_table *table);
struct mlx5_irq_table *mlx5_irq_table_get(struct mlx5_core_dev *dev);
int mlx5_set_msix_vec_count(struct mlx5_core_dev *dev, int devfn,


@@ -7,11 +7,19 @@
#include <linux/mlx5/driver.h>
#include "mlx5_core.h"
#include "mlx5_irq.h"
#include "sf/sf.h"
#ifdef CONFIG_RFS_ACCEL
#include <linux/cpu_rmap.h>
#endif
#define MLX5_MAX_IRQ_NAME (32)
/* max irq_index is 255. three chars */
#define MLX5_MAX_IRQ_IDX_CHARS (3)
#define MLX5_SFS_PER_CTRL_IRQ 64
#define MLX5_IRQ_CTRL_SF_MAX 8
/* min num of vectors for SFs to be enabled */
#define MLX5_IRQ_VEC_COMP_BASE_SF 2
struct mlx5_irq {
u32 index;
@@ -20,42 +28,22 @@ struct mlx5_irq {
char name[MLX5_MAX_IRQ_NAME];
struct kref kref;
int irqn;
struct mlx5_irq_table *table;
struct mlx5_irq_pool *pool;
};
struct mlx5_irq_pool {
char name[MLX5_MAX_IRQ_NAME - MLX5_MAX_IRQ_IDX_CHARS];
struct xa_limit xa_num_irqs;
struct xarray irqs;
struct mlx5_core_dev *dev;
};
struct mlx5_irq_table {
struct xarray irqs;
int nvec;
struct mlx5_irq_pool *pf_pool;
struct mlx5_irq_pool *sf_ctrl_pool;
struct mlx5_irq_pool *sf_comp_pool;
};
int mlx5_irq_table_init(struct mlx5_core_dev *dev)
{
struct mlx5_irq_table *irq_table;
if (mlx5_core_is_sf(dev))
return 0;
irq_table = kvzalloc(sizeof(*irq_table), GFP_KERNEL);
if (!irq_table)
return -ENOMEM;
dev->priv.irq_table = irq_table;
return 0;
}
void mlx5_irq_table_cleanup(struct mlx5_core_dev *dev)
{
if (mlx5_core_is_sf(dev))
return;
kvfree(dev->priv.irq_table);
}
int mlx5_irq_get_num_comp(struct mlx5_irq_table *table)
{
return table->nvec - MLX5_IRQ_VEC_COMP_BASE;
}
/**
* mlx5_get_default_msix_vec_count - Get the default number of MSI-X vectors
* to be assigned to each VF.
@@ -144,9 +132,9 @@ out:
static void irq_release(struct kref *kref)
{
struct mlx5_irq *irq = container_of(kref, struct mlx5_irq, kref);
struct mlx5_irq_table *table = irq->table;
struct mlx5_irq_pool *pool = irq->pool;
xa_erase(&table->irqs, irq->index);
xa_erase(&pool->irqs, irq->index);
/* free_irq requires that affinity and rmap will be cleared
* before calling it. This is why there is asymmetry with set_rmap
* which should be called after alloc_irq but before request_irq.
@@ -162,6 +150,76 @@ static void irq_put(struct mlx5_irq *irq)
kref_put(&irq->kref, irq_release);
}
static irqreturn_t irq_int_handler(int irq, void *nh)
{
atomic_notifier_call_chain(nh, 0, NULL);
return IRQ_HANDLED;
}
static void irq_sf_set_name(struct mlx5_irq_pool *pool, char *name, int vecidx)
{
snprintf(name, MLX5_MAX_IRQ_NAME, "%s%d", pool->name, vecidx);
}
static void irq_set_name(char *name, int vecidx)
{
if (vecidx == 0) {
snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_async%d", vecidx);
return;
}
snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d",
vecidx - MLX5_IRQ_VEC_COMP_BASE);
}
static struct mlx5_irq *irq_request(struct mlx5_irq_pool *pool, int i)
{
struct mlx5_core_dev *dev = pool->dev;
char name[MLX5_MAX_IRQ_NAME];
struct mlx5_irq *irq;
int err;
irq = kzalloc(sizeof(*irq), GFP_KERNEL);
if (!irq)
return ERR_PTR(-ENOMEM);
irq->irqn = pci_irq_vector(dev->pdev, i);
if (!pool->name[0])
irq_set_name(name, i);
else
irq_sf_set_name(pool, name, i);
ATOMIC_INIT_NOTIFIER_HEAD(&irq->nh);
snprintf(irq->name, MLX5_MAX_IRQ_NAME,
"%s@pci:%s", name, pci_name(dev->pdev));
err = request_irq(irq->irqn, irq_int_handler, 0, irq->name,
&irq->nh);
if (err) {
mlx5_core_err(dev, "Failed to request irq. err = %d\n", err);
goto err_req_irq;
}
if (!zalloc_cpumask_var(&irq->mask, GFP_KERNEL)) {
mlx5_core_warn(dev, "zalloc_cpumask_var failed\n");
err = -ENOMEM;
goto err_cpumask;
}
err = xa_alloc(&pool->irqs, &irq->index, irq, pool->xa_num_irqs,
GFP_KERNEL);
if (err) {
mlx5_core_err(dev, "Failed to alloc xa entry for irq(%u). err = %d\n",
irq->index, err);
goto err_xa;
}
irq->pool = pool;
kref_init(&irq->kref);
return irq;
err_xa:
free_cpumask_var(irq->mask);
err_cpumask:
free_irq(irq->irqn, &irq->nh);
err_req_irq:
kfree(irq);
return ERR_PTR(err);
}
int mlx5_irq_attach_nb(struct mlx5_irq *irq, struct notifier_block *nb)
{
int err;
@@ -184,69 +242,9 @@ int mlx5_irq_detach_nb(struct mlx5_irq *irq, struct notifier_block *nb)
return atomic_notifier_chain_unregister(&irq->nh, nb);
}
static irqreturn_t irq_int_handler(int irq, void *nh)
struct cpumask *mlx5_irq_get_affinity_mask(struct mlx5_irq *irq)
{
atomic_notifier_call_chain(nh, 0, NULL);
return IRQ_HANDLED;
}
static void irq_set_name(char *name, int vecidx)
{
if (!vecidx) {
snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_async");
return;
}
snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d",
vecidx - MLX5_IRQ_VEC_COMP_BASE);
}
static struct mlx5_irq *irq_request(struct mlx5_core_dev *dev, int i)
{
struct mlx5_irq_table *table = mlx5_irq_table_get(dev);
char name[MLX5_MAX_IRQ_NAME];
struct xa_limit xa_num_irqs;
struct mlx5_irq *irq;
int err;
irq = kzalloc(sizeof(*irq), GFP_KERNEL);
if (!irq)
return ERR_PTR(-ENOMEM);
irq->irqn = pci_irq_vector(dev->pdev, i);
irq_set_name(name, i);
ATOMIC_INIT_NOTIFIER_HEAD(&irq->nh);
snprintf(irq->name, MLX5_MAX_IRQ_NAME,
"%s@pci:%s", name, pci_name(dev->pdev));
err = request_irq(irq->irqn, irq_int_handler, 0, irq->name,
&irq->nh);
if (err) {
mlx5_core_err(dev, "Failed to request irq. err = %d\n", err);
goto err_req_irq;
}
if (!zalloc_cpumask_var(&irq->mask, GFP_KERNEL)) {
mlx5_core_warn(dev, "zalloc_cpumask_var failed\n");
err = -ENOMEM;
goto err_cpumask;
}
xa_num_irqs.min = 0;
xa_num_irqs.max = table->nvec;
err = xa_alloc(&table->irqs, &irq->index, irq, xa_num_irqs,
GFP_KERNEL);
if (err) {
mlx5_core_err(dev, "Failed to alloc xa entry for irq(%u). err = %d\n",
irq->index, err);
goto err_xa;
}
irq->table = table;
kref_init(&irq->kref);
return irq;
err_xa:
free_cpumask_var(irq->mask);
err_cpumask:
free_irq(irq->irqn, &irq->nh);
err_req_irq:
kfree(irq);
return ERR_PTR(err);
return irq->mask;
}
/**
@@ -272,14 +270,17 @@ struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, int vecidx,
struct cpumask *affinity)
{
struct mlx5_irq_table *irq_table = mlx5_irq_table_get(dev);
struct mlx5_irq_pool *pool;
struct mlx5_irq *irq;
irq = xa_load(&irq_table->irqs, vecidx);
pool = irq_table->pf_pool;
irq = xa_load(&pool->irqs, vecidx);
if (irq) {
kref_get(&irq->kref);
return irq;
}
irq = irq_request(dev, vecidx);
irq = irq_request(pool, vecidx);
if (IS_ERR(irq))
return irq;
cpumask_copy(irq->mask, affinity);
@@ -287,53 +288,162 @@ struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, int vecidx,
return irq;
}
struct cpumask *mlx5_irq_get_affinity_mask(struct mlx5_irq *irq)
/* irq_pool API */
static struct mlx5_irq_pool *
irq_pool_alloc(struct mlx5_core_dev *dev, int start, int size, char *name)
{
return irq->mask;
struct mlx5_irq_pool *pool = kvzalloc(sizeof(*pool), GFP_KERNEL);
if (!pool)
return ERR_PTR(-ENOMEM);
pool->dev = dev;
xa_init_flags(&pool->irqs, XA_FLAGS_ALLOC);
pool->xa_num_irqs.min = start;
pool->xa_num_irqs.max = start + size - 1;
if (name)
snprintf(pool->name, MLX5_MAX_IRQ_NAME - MLX5_MAX_IRQ_IDX_CHARS,
name);
mlx5_core_dbg(dev, "pool->name = %s, pool->size = %d, pool->start = %d",
name, size, start);
return pool;
}
static void irq_pool_free(struct mlx5_irq_pool *pool)
{
struct mlx5_irq *irq;
unsigned long index;
xa_for_each(&pool->irqs, index, irq)
irq_release(&irq->kref);
xa_destroy(&pool->irqs);
kvfree(pool);
}
static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pf_vec)
{
struct mlx5_irq_table *table = dev->priv.irq_table;
int num_sf_ctrl_by_msix;
int num_sf_ctrl_by_sfs;
int num_sf_ctrl;
int err;
/* init pf_pool */
table->pf_pool = irq_pool_alloc(dev, 0, pf_vec, NULL);
if (IS_ERR(table->pf_pool))
return PTR_ERR(table->pf_pool);
if (!mlx5_sf_max_functions(dev))
return 0;
if (sf_vec < MLX5_IRQ_VEC_COMP_BASE_SF) {
mlx5_core_err(dev, "Not enught IRQs for SFs. SF may run at lower performance\n");
return 0;
}
/* init sf_ctrl_pool */
num_sf_ctrl_by_msix = DIV_ROUND_UP(sf_vec, MLX5_COMP_EQS_PER_SF);
num_sf_ctrl_by_sfs = DIV_ROUND_UP(mlx5_sf_max_functions(dev),
MLX5_SFS_PER_CTRL_IRQ);
num_sf_ctrl = min_t(int, num_sf_ctrl_by_msix, num_sf_ctrl_by_sfs);
num_sf_ctrl = min_t(int, MLX5_IRQ_CTRL_SF_MAX, num_sf_ctrl);
table->sf_ctrl_pool = irq_pool_alloc(dev, pf_vec, num_sf_ctrl,
"mlx5_sf_ctrl");
if (IS_ERR(table->sf_ctrl_pool)) {
err = PTR_ERR(table->sf_ctrl_pool);
goto err_pf;
}
/* init sf_comp_pool */
table->sf_comp_pool = irq_pool_alloc(dev, pf_vec + num_sf_ctrl,
sf_vec - num_sf_ctrl, "mlx5_sf_comp");
if (IS_ERR(table->sf_comp_pool)) {
err = PTR_ERR(table->sf_comp_pool);
goto err_sf_ctrl;
}
return 0;
err_sf_ctrl:
irq_pool_free(table->sf_ctrl_pool);
err_pf:
irq_pool_free(table->pf_pool);
return err;
}
static void irq_pools_destroy(struct mlx5_irq_table *table)
{
if (table->sf_ctrl_pool) {
irq_pool_free(table->sf_comp_pool);
irq_pool_free(table->sf_ctrl_pool);
}
irq_pool_free(table->pf_pool);
}
/* irq_table API */
int mlx5_irq_table_init(struct mlx5_core_dev *dev)
{
struct mlx5_irq_table *irq_table;
if (mlx5_core_is_sf(dev))
return 0;
irq_table = kvzalloc(sizeof(*irq_table), GFP_KERNEL);
if (!irq_table)
return -ENOMEM;
dev->priv.irq_table = irq_table;
return 0;
}
void mlx5_irq_table_cleanup(struct mlx5_core_dev *dev)
{
if (mlx5_core_is_sf(dev))
return;
kvfree(dev->priv.irq_table);
}
int mlx5_irq_table_get_num_comp(struct mlx5_irq_table *table)
{
return table->pf_pool->xa_num_irqs.max - table->pf_pool->xa_num_irqs.min;
}
int mlx5_irq_table_create(struct mlx5_core_dev *dev)
{
struct mlx5_priv *priv = &dev->priv;
struct mlx5_irq_table *table = priv->irq_table;
int num_eqs = MLX5_CAP_GEN(dev, max_num_eqs) ?
MLX5_CAP_GEN(dev, max_num_eqs) :
1 << MLX5_CAP_GEN(dev, log_max_eq);
int nvec;
int total_vec;
int pf_vec;
int err;
if (mlx5_core_is_sf(dev))
return 0;
nvec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() +
pf_vec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() +
MLX5_IRQ_VEC_COMP_BASE;
nvec = min_t(int, nvec, num_eqs);
if (nvec <= MLX5_IRQ_VEC_COMP_BASE)
pf_vec = min_t(int, pf_vec, num_eqs);
if (pf_vec <= MLX5_IRQ_VEC_COMP_BASE)
return -ENOMEM;
xa_init_flags(&table->irqs, XA_FLAGS_ALLOC);
total_vec = pf_vec;
if (mlx5_sf_max_functions(dev))
total_vec += MLX5_IRQ_CTRL_SF_MAX +
MLX5_COMP_EQS_PER_SF * mlx5_sf_max_functions(dev);
nvec = pci_alloc_irq_vectors(dev->pdev, MLX5_IRQ_VEC_COMP_BASE + 1,
nvec, PCI_IRQ_MSIX);
if (nvec < 0) {
err = nvec;
goto err_free_irq;
}
total_vec = pci_alloc_irq_vectors(dev->pdev, MLX5_IRQ_VEC_COMP_BASE + 1,
total_vec, PCI_IRQ_MSIX);
if (total_vec < 0)
return total_vec;
pf_vec = min(pf_vec, total_vec);
table->nvec = nvec;
err = irq_pools_init(dev, total_vec - pf_vec, pf_vec);
if (err)
pci_free_irq_vectors(dev->pdev);
return 0;
err_free_irq:
xa_destroy(&table->irqs);
return err;
}
void mlx5_irq_table_destroy(struct mlx5_core_dev *dev)
{
struct mlx5_irq_table *table = dev->priv.irq_table;
struct mlx5_irq *irq;
unsigned long index;
if (mlx5_core_is_sf(dev))
return;
@@ -341,10 +451,8 @@ void mlx5_irq_table_destroy(struct mlx5_core_dev *dev)
/* There are cases where IRQs will still be in use when we reach
* here. Hence, make sure all the irqs are released.
*/
xa_for_each(&table->irqs, index, irq)
irq_release(&irq->kref);
irq_pools_destroy(table);
pci_free_irq_vectors(dev->pdev);
xa_destroy(&table->irqs);
}
struct mlx5_irq_table *mlx5_irq_table_get(struct mlx5_core_dev *dev)