From d6a488f21c0f3c44bfbb2339a75159ee55aa2b6f Mon Sep 17 00:00:00 2001
From: Devesh Sharma <devesh.sharma@emulex.com>
Date: Mon, 9 Jun 2014 10:52:37 +0530
Subject: [PATCH 01/26] RDMA/ocrdma: Report correct value of
 max_fast_reg_page_list_len

Fix ocrdma_query_device() to report correct value of max_fast_reg_page_list_len.

Signed-off-by: Devesh Sharma <devesh.sharma@emulex.com>
Signed-off-by: Selvin Xavier <selvin.xavier@emulex.com>
Signed-off-by: Mitesh Ahuja <mitesh.ahuja@emulex.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index acb434d16903..4527311171c5 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -101,7 +101,7 @@ int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr)
 	attr->max_srq_sge = dev->attr.max_srq_sge;
 	attr->max_srq_wr = dev->attr.max_rqe;
 	attr->local_ca_ack_delay = dev->attr.local_ca_ack_delay;
-	attr->max_fast_reg_page_list_len = 0;
+	attr->max_fast_reg_page_list_len = dev->attr.max_pages_per_frmr;
 	attr->max_pkeys = 1;
 	return 0;
 }

From f93439e476d012b2503dbb07fe0fc675bcbff099 Mon Sep 17 00:00:00 2001
From: Devesh Sharma <devesh.sharma@emulex.com>
Date: Mon, 9 Jun 2014 10:52:38 +0530
Subject: [PATCH 02/26] RDMA/ocrdma: Do not skip setting deferred_arm

When ib_request_notify_cq() is called for the first time, ocrdma tries
to skip setting deffered_arm flag. This may lead CQ to an un-armed
state thus never generating a CQ event and leaving consumer hung.

This patch removes the part of code that skips setting deferred_arm.

Signed-off-by: Devesh Sharma <devesh.sharma@emulex.com>
Signed-off-by: Selvin Xavier <selvin.xavier@emulex.com>
Signed-off-by: Mitesh Ahuja <mitesh.ahuja@emulex.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index 4527311171c5..8f5f2577f288 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -2846,11 +2846,9 @@ int ocrdma_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags cq_flags)
 	if (cq->first_arm) {
 		ocrdma_ring_cq_db(dev, cq_id, arm_needed, sol_needed, 0);
 		cq->first_arm = false;
-		goto skip_defer;
 	}
-	cq->deferred_arm = true;
 
-skip_defer:
+	cq->deferred_arm = true;
 	cq->deferred_sol = sol_needed;
 	spin_unlock_irqrestore(&cq->cq_lock, flags);
 

From 87773dd56d5405ac28119fcfadacefd35877c18f Mon Sep 17 00:00:00 2001
From: Shawn Bohrer <sbohrer@rgmadvisors.com>
Date: Wed, 3 Sep 2014 12:13:57 -0500
Subject: [PATCH 03/26] IB: ib_umem_release() should decrement mm->pinned_vm
 from ib_umem_get

In debugging an application that receives -ENOMEM from ib_reg_mr(), I
found that ib_umem_get() can fail because the pinned_vm count has
wrapped causing it to always be larger than the lock limit even with
RLIMIT_MEMLOCK set to RLIM_INFINITY.

The wrapping of pinned_vm occurs because the process that calls
ib_reg_mr() will have its mm->pinned_vm count incremented.  Later a
different process with a different mm_struct than the one that
allocated the ib_umem struct ends up releasing it which results in
decrementing the new processes mm->pinned_vm count past zero and
wrapping.

I'm not entirely sure what circumstances cause a different process to
release the ib_umem than the one that allocated it but the kernel
stack trace of the freeing process from my situation looks like the
following:

    Call Trace:
     [<ffffffff814d64b1>] dump_stack+0x19/0x1b
     [<ffffffffa0b522a5>] ib_umem_release+0x1f5/0x200 [ib_core]
     [<ffffffffa0b90681>] mlx4_ib_destroy_qp+0x241/0x440 [mlx4_ib]
     [<ffffffffa0b4d93c>] ib_destroy_qp+0x12c/0x170 [ib_core]
     [<ffffffffa0cc7129>] ib_uverbs_close+0x259/0x4e0 [ib_uverbs]
     [<ffffffff81141cba>] __fput+0xba/0x240
     [<ffffffff81141e4e>] ____fput+0xe/0x10
     [<ffffffff81060894>] task_work_run+0xc4/0xe0
     [<ffffffff810029e5>] do_notify_resume+0x95/0xa0
     [<ffffffff814e3dd0>] int_signal+0x12/0x17

The following patch fixes the issue by storing the pid struct of the
process that calls ib_umem_get() so that ib_umem_release and/or
ib_umem_account() can properly decrement the pinned_vm count of the
correct mm_struct.

Signed-off-by: Shawn Bohrer <sbohrer@rgmadvisors.com>
Reviewed-by: Shachar Raindel <raindel@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/core/umem.c | 19 +++++++++++++------
 include/rdma/ib_umem.h         |  1 +
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index a3a2e9c1639b..df0c4f605a21 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -105,6 +105,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 	umem->length    = size;
 	umem->offset    = addr & ~PAGE_MASK;
 	umem->page_size = PAGE_SIZE;
+	umem->pid       = get_task_pid(current, PIDTYPE_PID);
 	/*
 	 * We ask for writable memory if any access flags other than
 	 * "remote read" are set.  "Local write" and "remote write"
@@ -198,6 +199,7 @@ out:
 	if (ret < 0) {
 		if (need_release)
 			__ib_umem_release(context->device, umem, 0);
+		put_pid(umem->pid);
 		kfree(umem);
 	} else
 		current->mm->pinned_vm = locked;
@@ -230,15 +232,19 @@ void ib_umem_release(struct ib_umem *umem)
 {
 	struct ib_ucontext *context = umem->context;
 	struct mm_struct *mm;
+	struct task_struct *task;
 	unsigned long diff;
 
 	__ib_umem_release(umem->context->device, umem, 1);
 
-	mm = get_task_mm(current);
-	if (!mm) {
-		kfree(umem);
-		return;
-	}
+	task = get_pid_task(umem->pid, PIDTYPE_PID);
+	put_pid(umem->pid);
+	if (!task)
+		goto out;
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
 
 	diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;
 
@@ -262,9 +268,10 @@ void ib_umem_release(struct ib_umem *umem)
 	} else
 		down_write(&mm->mmap_sem);
 
-	current->mm->pinned_vm -= diff;
+	mm->pinned_vm -= diff;
 	up_write(&mm->mmap_sem);
 	mmput(mm);
+out:
 	kfree(umem);
 }
 EXPORT_SYMBOL(ib_umem_release);
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 1ea0b65c4cfb..a2bf41e0bde9 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -47,6 +47,7 @@ struct ib_umem {
 	int                     writable;
 	int                     hugetlb;
 	struct work_struct	work;
+	struct pid             *pid;
 	struct mm_struct       *mm;
 	unsigned long		diff;
 	struct sg_table sg_head;

From 3033771febebf2c66d5146624746ebc4a3d79314 Mon Sep 17 00:00:00 2001
From: Mike Marciniszyn <mike.marciniszyn@intel.com>
Date: Mon, 15 Sep 2014 11:43:06 -0400
Subject: [PATCH 04/26] IB/ipath: Change get_user_pages() usage to always NULL
 vmas

The static helper routine, __ipath_get_user_pages(), accepts a vma
arg, but current use always passes NULL.

This has caused some confusion associated with the correct use of this
argument, but since the current use case doesn't require the
flexiblity, the best thing to do is to simplfy the code to always pass
NULL to get_user_pages().

Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/hw/ipath/ipath_user_pages.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/hw/ipath/ipath_user_pages.c b/drivers/infiniband/hw/ipath/ipath_user_pages.c
index dc66c4506916..1da1252dcdb3 100644
--- a/drivers/infiniband/hw/ipath/ipath_user_pages.c
+++ b/drivers/infiniband/hw/ipath/ipath_user_pages.c
@@ -54,7 +54,7 @@ static void __ipath_release_user_pages(struct page **p, size_t num_pages,
 
 /* call with current->mm->mmap_sem held */
 static int __ipath_get_user_pages(unsigned long start_page, size_t num_pages,
-				  struct page **p, struct vm_area_struct **vma)
+				  struct page **p)
 {
 	unsigned long lock_limit;
 	size_t got;
@@ -74,7 +74,7 @@ static int __ipath_get_user_pages(unsigned long start_page, size_t num_pages,
 		ret = get_user_pages(current, current->mm,
 				     start_page + got * PAGE_SIZE,
 				     num_pages - got, 1, 1,
-				     p + got, vma);
+				     p + got, NULL);
 		if (ret < 0)
 			goto bail_release;
 	}
@@ -165,7 +165,7 @@ int ipath_get_user_pages(unsigned long start_page, size_t num_pages,
 
 	down_write(&current->mm->mmap_sem);
 
-	ret = __ipath_get_user_pages(start_page, num_pages, p, NULL);
+	ret = __ipath_get_user_pages(start_page, num_pages, p);
 
 	up_write(&current->mm->mmap_sem);
 

From f5c4984e065556b81edf3c9dadfbc8750de690fc Mon Sep 17 00:00:00 2001
From: Mike Marciniszyn <mike.marciniszyn@intel.com>
Date: Mon, 15 Sep 2014 11:42:53 -0400
Subject: [PATCH 05/26] IB/qib: Change get_user_pages() usage to always NULL
 vmas

The static helper routine, __qib_get_user_pages(), accepts a vma arg,
but current use always passes NULL.

This has caused some confusion associated with the correct use of this
argument, but since the current use case doesn't require the
flexiblity, the best thing to do is to simplfy the code to always pass
NULL to get_user_pages().

Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/hw/qib/qib_user_pages.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c
index 2bc1d2b96298..74f90b2619f6 100644
--- a/drivers/infiniband/hw/qib/qib_user_pages.c
+++ b/drivers/infiniband/hw/qib/qib_user_pages.c
@@ -52,7 +52,7 @@ static void __qib_release_user_pages(struct page **p, size_t num_pages,
  * Call with current->mm->mmap_sem held.
  */
 static int __qib_get_user_pages(unsigned long start_page, size_t num_pages,
-				struct page **p, struct vm_area_struct **vma)
+				struct page **p)
 {
 	unsigned long lock_limit;
 	size_t got;
@@ -69,7 +69,7 @@ static int __qib_get_user_pages(unsigned long start_page, size_t num_pages,
 		ret = get_user_pages(current, current->mm,
 				     start_page + got * PAGE_SIZE,
 				     num_pages - got, 1, 1,
-				     p + got, vma);
+				     p + got, NULL);
 		if (ret < 0)
 			goto bail_release;
 	}
@@ -136,7 +136,7 @@ int qib_get_user_pages(unsigned long start_page, size_t num_pages,
 
 	down_write(&current->mm->mmap_sem);
 
-	ret = __qib_get_user_pages(start_page, num_pages, p, NULL);
+	ret = __qib_get_user_pages(start_page, num_pages, p);
 
 	up_write(&current->mm->mmap_sem);
 

From 50e2ec9105fe1045298924a6c7b9ce870a0b6c13 Mon Sep 17 00:00:00 2001
From: Markus Stockhausen <stockhausen@collogia.de>
Date: Wed, 13 Aug 2014 14:07:30 +0000
Subject: [PATCH 06/26] IB/mlx4: Disable TSO for Connect-X rev. A0 HCAs

According to <http://marc.info/?t=138347640900004&r=1&w=2>, revision
A0 of Connect-X does not correctly assemble TSO packets.  Disable that
feature on that hardware revision.

Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/hw/mlx4/main.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index e1e558a3d692..20f731f08c7e 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -59,6 +59,7 @@
 
 #define MLX4_IB_FLOW_MAX_PRIO 0xFFF
 #define MLX4_IB_FLOW_QPN_MASK 0xFFFFFF
+#define MLX4_IB_CARD_REV_A0   0xA0
 
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver");
@@ -158,7 +159,9 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
 		props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE;
 	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM)
 		props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
-	if (dev->dev->caps.max_gso_sz && dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH)
+	if (dev->dev->caps.max_gso_sz &&
+	    (dev->dev->rev_id != MLX4_IB_CARD_REV_A0) &&
+	    (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH))
 		props->device_cap_flags |= IB_DEVICE_UD_TSO;
 	if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_RESERVED_LKEY)
 		props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY;

From 68f9d83c7765b110de54677573e9538f3ca6ca04 Mon Sep 17 00:00:00 2001
From: Alex Estrin <alex.estrin@intel.com>
Date: Wed, 20 Aug 2014 11:15:08 -0400
Subject: [PATCH 07/26] IPoIB: Remove unnecessary port query

There are two queries for port attributes one after another. A second
call is not needed since port_attr structure already holds the data.

Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Alex Estrin <alex.estrin@intel.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index d4e005720d01..ffb83b5f7e80 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -529,21 +529,13 @@ void ipoib_mcast_join_task(struct work_struct *work)
 			  port_attr.state);
 		return;
 	}
+	priv->local_lid = port_attr.lid;
 
 	if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid))
 		ipoib_warn(priv, "ib_query_gid() failed\n");
 	else
 		memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
 
-	{
-		struct ib_port_attr attr;
-
-		if (!ib_query_port(priv->ca, priv->port, &attr))
-			priv->local_lid = attr.lid;
-		else
-			ipoib_warn(priv, "ib_query_port failed\n");
-	}
-
 	if (!priv->broadcast) {
 		struct ipoib_mcast *broadcast;
 

From 85cbb7c728bf39c45a9789b88c9471c0d7a58b0e Mon Sep 17 00:00:00 2001
From: Mike Marciniszyn <mike.marciniszyn@intel.com>
Date: Fri, 19 Sep 2014 08:32:19 -0400
Subject: [PATCH 08/26] IB/qib: Correct reference counting in debugfs qp_stats

This particular reference count is not needed with the rcu protection,
and the current code leaks a reference count, causing a hang in
qib_qp_destroy().

Cc: <stable@vger.kernel.org>
Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/hw/qib/qib_debugfs.c | 3 ++-
 drivers/infiniband/hw/qib/qib_qp.c      | 8 --------
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/drivers/infiniband/hw/qib/qib_debugfs.c b/drivers/infiniband/hw/qib/qib_debugfs.c
index 799a0c3bffc4..6abd3ed3cd51 100644
--- a/drivers/infiniband/hw/qib/qib_debugfs.c
+++ b/drivers/infiniband/hw/qib/qib_debugfs.c
@@ -193,6 +193,7 @@ static void *_qp_stats_seq_start(struct seq_file *s, loff_t *pos)
 	struct qib_qp_iter *iter;
 	loff_t n = *pos;
 
+	rcu_read_lock();
 	iter = qib_qp_iter_init(s->private);
 	if (!iter)
 		return NULL;
@@ -224,7 +225,7 @@ static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr,
 
 static void _qp_stats_seq_stop(struct seq_file *s, void *iter_ptr)
 {
-	/* nothing for now */
+	rcu_read_unlock();
 }
 
 static int _qp_stats_seq_show(struct seq_file *s, void *iter_ptr)
diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c
index 7fcc150d603c..6ddc0264aad2 100644
--- a/drivers/infiniband/hw/qib/qib_qp.c
+++ b/drivers/infiniband/hw/qib/qib_qp.c
@@ -1325,7 +1325,6 @@ int qib_qp_iter_next(struct qib_qp_iter *iter)
 	struct qib_qp *pqp = iter->qp;
 	struct qib_qp *qp;
 
-	rcu_read_lock();
 	for (; n < dev->qp_table_size; n++) {
 		if (pqp)
 			qp = rcu_dereference(pqp->next);
@@ -1333,18 +1332,11 @@ int qib_qp_iter_next(struct qib_qp_iter *iter)
 			qp = rcu_dereference(dev->qp_table[n]);
 		pqp = qp;
 		if (qp) {
-			if (iter->qp)
-				atomic_dec(&iter->qp->refcount);
-			atomic_inc(&qp->refcount);
-			rcu_read_unlock();
 			iter->qp = qp;
 			iter->n = n;
 			return 0;
 		}
 	}
-	rcu_read_unlock();
-	if (iter->qp)
-		atomic_dec(&iter->qp->refcount);
 	return ret;
 }
 

From 4ff0acca7344c93fd9ed778b4c3ce16d95c594e4 Mon Sep 17 00:00:00 2001
From: Matan Barak <matanb@mellanox.com>
Date: Thu, 11 Sep 2014 13:18:37 +0300
Subject: [PATCH 09/26] mlx4: Correct error flows in rereg_mr

This patch addresses feedback from Sagi Grimberg on the rereg_mr
implementation of mlx4.  The following are fixed:

1. Set the correct pd_flags
2. Make sure we change the iova and size MR fields only after
   successful write and allocation of the MTTs.
3. Make the error checking more robust

Fixes: e630664c8383 ("mlx4_core: Add helper functions to support MR re-registration")
Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/hw/mlx4/mr.c         |  7 ++++--
 drivers/net/ethernet/mellanox/mlx4/mr.c | 33 ++++++++++++++++---------
 2 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
index 9b0e80e59b08..8f9325cfc85d 100644
--- a/drivers/infiniband/hw/mlx4/mr.c
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -234,14 +234,13 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
 					0);
 		if (IS_ERR(mmr->umem)) {
 			err = PTR_ERR(mmr->umem);
+			/* Prevent mlx4_ib_dereg_mr from free'ing invalid pointer */
 			mmr->umem = NULL;
 			goto release_mpt_entry;
 		}
 		n = ib_umem_page_count(mmr->umem);
 		shift = ilog2(mmr->umem->page_size);
 
-		mmr->mmr.iova       = virt_addr;
-		mmr->mmr.size       = length;
 		err = mlx4_mr_rereg_mem_write(dev->dev, &mmr->mmr,
 					      virt_addr, length, n, shift,
 					      *pmpt_entry);
@@ -249,6 +248,8 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
 			ib_umem_release(mmr->umem);
 			goto release_mpt_entry;
 		}
+		mmr->mmr.iova       = virt_addr;
+		mmr->mmr.size       = length;
 
 		err = mlx4_ib_umem_write_mtt(dev, &mmr->mmr.mtt, mmr->umem);
 		if (err) {
@@ -262,6 +263,8 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
 	 * return a failure. But dereg_mr will free the resources.
 	 */
 	err = mlx4_mr_hw_write_mpt(dev->dev, &mmr->mmr, pmpt_entry);
+	if (!err && flags & IB_MR_REREG_ACCESS)
+		mmr->mmr.access = mr_access_flags;
 
 release_mpt_entry:
 	mlx4_mr_hw_put_mpt(dev->dev, pmpt_entry);
diff --git a/drivers/net/ethernet/mellanox/mlx4/mr.c b/drivers/net/ethernet/mellanox/mlx4/mr.c
index 7d717eccb7b0..193a6adb5d04 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mr.c
+++ b/drivers/net/ethernet/mellanox/mlx4/mr.c
@@ -298,6 +298,7 @@ static int mlx4_HW2SW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox
 			    MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED);
 }
 
+/* Must protect against concurrent access */
 int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
 		       struct mlx4_mpt_entry ***mpt_entry)
 {
@@ -305,13 +306,10 @@ int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
 	int key = key_to_hw_index(mmr->key) & (dev->caps.num_mpts - 1);
 	struct mlx4_cmd_mailbox *mailbox = NULL;
 
-	/* Make sure that at this point we have single-threaded access only */
-
 	if (mmr->enabled != MLX4_MPT_EN_HW)
 		return -EINVAL;
 
 	err = mlx4_HW2SW_MPT(dev, NULL, key);
-
 	if (err) {
 		mlx4_warn(dev, "HW2SW_MPT failed (%d).", err);
 		mlx4_warn(dev, "Most likely the MR has MWs bound to it.\n");
@@ -333,7 +331,6 @@ int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
 				   0, MLX4_CMD_QUERY_MPT,
 				   MLX4_CMD_TIME_CLASS_B,
 				   MLX4_CMD_WRAPPED);
-
 		if (err)
 			goto free_mailbox;
 
@@ -378,9 +375,10 @@ int mlx4_mr_hw_write_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
 		err = mlx4_SW2HW_MPT(dev, mailbox, key);
 	}
 
-	mmr->pd = be32_to_cpu((*mpt_entry)->pd_flags) & MLX4_MPT_PD_MASK;
-	if (!err)
+	if (!err) {
+		mmr->pd = be32_to_cpu((*mpt_entry)->pd_flags) & MLX4_MPT_PD_MASK;
 		mmr->enabled = MLX4_MPT_EN_HW;
+	}
 	return err;
 }
 EXPORT_SYMBOL_GPL(mlx4_mr_hw_write_mpt);
@@ -400,11 +398,12 @@ EXPORT_SYMBOL_GPL(mlx4_mr_hw_put_mpt);
 int mlx4_mr_hw_change_pd(struct mlx4_dev *dev, struct mlx4_mpt_entry *mpt_entry,
 			 u32 pdn)
 {
-	u32 pd_flags = be32_to_cpu(mpt_entry->pd_flags);
+	u32 pd_flags = be32_to_cpu(mpt_entry->pd_flags) & ~MLX4_MPT_PD_MASK;
 	/* The wrapper function will put the slave's id here */
 	if (mlx4_is_mfunc(dev))
 		pd_flags &= ~MLX4_MPT_PD_VF_MASK;
-	mpt_entry->pd_flags = cpu_to_be32((pd_flags &  ~MLX4_MPT_PD_MASK) |
+
+	mpt_entry->pd_flags = cpu_to_be32(pd_flags |
 					  (pdn & MLX4_MPT_PD_MASK)
 					  | MLX4_MPT_PD_FLAG_EN_INV);
 	return 0;
@@ -600,14 +599,18 @@ int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr,
 {
 	int err;
 
-	mpt_entry->start       = cpu_to_be64(mr->iova);
-	mpt_entry->length      = cpu_to_be64(mr->size);
-	mpt_entry->entity_size = cpu_to_be32(mr->mtt.page_shift);
+	mpt_entry->start       = cpu_to_be64(iova);
+	mpt_entry->length      = cpu_to_be64(size);
+	mpt_entry->entity_size = cpu_to_be32(page_shift);
 
 	err = mlx4_mtt_init(dev, npages, page_shift, &mr->mtt);
 	if (err)
 		return err;
 
+	mpt_entry->pd_flags &= cpu_to_be32(MLX4_MPT_PD_MASK |
+					   MLX4_MPT_PD_FLAG_EN_INV);
+	mpt_entry->flags    &= cpu_to_be32(MLX4_MPT_FLAG_FREE |
+					   MLX4_MPT_FLAG_SW_OWNS);
 	if (mr->mtt.order < 0) {
 		mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_PHYSICAL);
 		mpt_entry->mtt_addr = 0;
@@ -617,6 +620,14 @@ int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr,
 		if (mr->mtt.page_shift == 0)
 			mpt_entry->mtt_sz    = cpu_to_be32(1 << mr->mtt.order);
 	}
+	if (mr->mtt.order >= 0 && mr->mtt.page_shift == 0) {
+		/* fast register MR in free state */
+		mpt_entry->flags    |= cpu_to_be32(MLX4_MPT_FLAG_FREE);
+		mpt_entry->pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_FAST_REG |
+						   MLX4_MPT_PD_FLAG_RAE);
+	} else {
+		mpt_entry->flags    |= cpu_to_be32(MLX4_MPT_FLAG_SW_OWNS);
+	}
 	mr->enabled = MLX4_MPT_EN_SW;
 
 	return 0;

From 1be528bcb88d0b854dda1d60b31f4f8f7310f034 Mon Sep 17 00:00:00 2001
From: "devesh.sharma@emulex.com" <devesh.sharma@emulex.com>
Date: Fri, 5 Sep 2014 15:09:48 +0530
Subject: [PATCH 10/26] RDMA/ocrdma: Resolve L2 address when creating user AH

Because of IP-based GIDs, userspace AHs must have MAC and VLAN ID
resolved separately.  Presently, user AHs are broken for ocrdma.  This
patch resolves L2 addresses while creating user AH and obtains the
right DMAC and VLAN ID before creating AH.

Signed-off-by: Devesh Sharma <devesh.sharma@emulex.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/hw/ocrdma/ocrdma_ah.c | 41 +++++++++++++++++-------
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
index 40f8536c10b0..a9f967d47575 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
@@ -38,7 +38,7 @@
 #define OCRDMA_VID_PCP_SHIFT	0xD
 
 static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah,
-				struct ib_ah_attr *attr, int pdid)
+			struct ib_ah_attr *attr, union ib_gid *sgid, int pdid)
 {
 	int status = 0;
 	u16 vlan_tag; bool vlan_enabled = false;
@@ -49,8 +49,7 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah,
 	memset(&eth, 0, sizeof(eth));
 	memset(&grh, 0, sizeof(grh));
 
-	ah->sgid_index = attr->grh.sgid_index;
-
+	/* VLAN */
 	vlan_tag = attr->vlan_id;
 	if (!vlan_tag || (vlan_tag > 0xFFF))
 		vlan_tag = dev->pvid;
@@ -65,15 +64,14 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah,
 		eth.eth_type = cpu_to_be16(OCRDMA_ROCE_ETH_TYPE);
 		eth_sz = sizeof(struct ocrdma_eth_basic);
 	}
+	/* MAC */
 	memcpy(&eth.smac[0], &dev->nic_info.mac_addr[0], ETH_ALEN);
-	memcpy(&eth.dmac[0], attr->dmac, ETH_ALEN);
 	status = ocrdma_resolve_dmac(dev, attr, &eth.dmac[0]);
 	if (status)
 		return status;
-	status = ocrdma_query_gid(&dev->ibdev, 1, attr->grh.sgid_index,
-			(union ib_gid *)&grh.sgid[0]);
-	if (status)
-		return status;
+	ah->sgid_index = attr->grh.sgid_index;
+	memcpy(&grh.sgid[0], sgid->raw, sizeof(union ib_gid));
+	memcpy(&grh.dgid[0], attr->grh.dgid.raw, sizeof(attr->grh.dgid.raw));
 
 	grh.tclass_flow = cpu_to_be32((6 << 28) |
 			(attr->grh.traffic_class << 24) |
@@ -81,8 +79,7 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah,
 	/* 0x1b is next header value in GRH */
 	grh.pdid_hoplimit = cpu_to_be32((pdid << 16) |
 			(0x1b << 8) | attr->grh.hop_limit);
-
-	memcpy(&grh.dgid[0], attr->grh.dgid.raw, sizeof(attr->grh.dgid.raw));
+	/* Eth HDR */
 	memcpy(&ah->av->eth_hdr, &eth, eth_sz);
 	memcpy((u8 *)ah->av + eth_sz, &grh, sizeof(struct ocrdma_grh));
 	if (vlan_enabled)
@@ -98,6 +95,8 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)
 	struct ocrdma_ah *ah;
 	struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
 	struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
+	union ib_gid sgid;
+	u8 zmac[ETH_ALEN];
 
 	if (!(attr->ah_flags & IB_AH_GRH))
 		return ERR_PTR(-EINVAL);
@@ -111,7 +110,27 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)
 	status = ocrdma_alloc_av(dev, ah);
 	if (status)
 		goto av_err;
-	status = set_av_attr(dev, ah, attr, pd->id);
+
+	status = ocrdma_query_gid(&dev->ibdev, 1, attr->grh.sgid_index, &sgid);
+	if (status) {
+		pr_err("%s(): Failed to query sgid, status = %d\n",
+		      __func__, status);
+		goto av_conf_err;
+	}
+
+	memset(&zmac, 0, ETH_ALEN);
+	if (pd->uctx &&
+	    memcmp(attr->dmac, &zmac, ETH_ALEN)) {
+		status = rdma_addr_find_dmac_by_grh(&sgid, &attr->grh.dgid,
+                                        attr->dmac, &attr->vlan_id);
+		if (status) {
+			pr_err("%s(): Failed to resolve dmac from gid." 
+				"status = %d\n", __func__, status);
+			goto av_conf_err;
+		}
+	}
+
+	status = set_av_attr(dev, ah, attr, &sgid, pd->id);
 	if (status)
 		goto av_conf_err;
 

From f0c2c225dfe9dfb668fe72eadabb8a3ec74ca036 Mon Sep 17 00:00:00 2001
From: "devesh.sharma@emulex.com" <devesh.sharma@emulex.com>
Date: Fri, 5 Sep 2014 15:09:49 +0530
Subject: [PATCH 11/26] RDMA/ocrdma: Use right macro in query AH

ocrdma_query_ah() does not use correct macro, and checks the wrong bit
for the validity of address handle in vector table.  Fix this.

Signed-off-by: Devesh Sharma <devesh.sharma@emulex.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/hw/ocrdma/ocrdma_ah.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
index a9f967d47575..ac02ce4e8040 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
@@ -164,7 +164,7 @@ int ocrdma_query_ah(struct ib_ah *ibah, struct ib_ah_attr *attr)
 	struct ocrdma_av *av = ah->av;
 	struct ocrdma_grh *grh;
 	attr->ah_flags |= IB_AH_GRH;
-	if (ah->av->valid & Bit(1)) {
+	if (ah->av->valid & OCRDMA_AV_VALID) {
 		grh = (struct ocrdma_grh *)((u8 *)ah->av +
 				sizeof(struct ocrdma_eth_vlan));
 		attr->sl = be16_to_cpu(av->eth_hdr.vlan_tag) >> 13;

From c33b15f00bbfb9324dc38e5176f576a0f46e0873 Mon Sep 17 00:00:00 2001
From: Roi Dayan <roid@mellanox.com>
Date: Tue, 2 Sep 2014 17:08:41 +0300
Subject: [PATCH 12/26] IB/iser: Fix RX/TX CQ resource leak on error flow

When failing to allocate TX CQ we already allocated RX CQ, so we need to make
sure we release it. Also, when failing to register notification to the RX CQ
we currently leak both RX and TX CQs of the current index, fix that too.

Signed-off-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/ulp/iser/iser_verbs.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index 3ef167f97d6f..3bfec4bbda52 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -73,7 +73,7 @@ static int iser_create_device_ib_res(struct iser_device *device)
 {
 	struct iser_cq_desc *cq_desc;
 	struct ib_device_attr *dev_attr = &device->dev_attr;
-	int ret, i, j;
+	int ret, i;
 
 	ret = ib_query_device(device->ib_device, dev_attr);
 	if (ret) {
@@ -125,16 +125,20 @@ static int iser_create_device_ib_res(struct iser_device *device)
 					  iser_cq_event_callback,
 					  (void *)&cq_desc[i],
 					  ISER_MAX_RX_CQ_LEN, i);
-		if (IS_ERR(device->rx_cq[i]))
+		if (IS_ERR(device->rx_cq[i])) {
+			device->rx_cq[i] = NULL;
 			goto cq_err;
+		}
 
 		device->tx_cq[i] = ib_create_cq(device->ib_device,
 					  NULL, iser_cq_event_callback,
 					  (void *)&cq_desc[i],
 					  ISER_MAX_TX_CQ_LEN, i);
 
-		if (IS_ERR(device->tx_cq[i]))
+		if (IS_ERR(device->tx_cq[i])) {
+			device->tx_cq[i] = NULL;
 			goto cq_err;
+		}
 
 		if (ib_req_notify_cq(device->rx_cq[i], IB_CQ_NEXT_COMP))
 			goto cq_err;
@@ -160,14 +164,14 @@ static int iser_create_device_ib_res(struct iser_device *device)
 handler_err:
 	ib_dereg_mr(device->mr);
 dma_mr_err:
-	for (j = 0; j < device->cqs_used; j++)
-		tasklet_kill(&device->cq_tasklet[j]);
+	for (i = 0; i < device->cqs_used; i++)
+		tasklet_kill(&device->cq_tasklet[i]);
 cq_err:
-	for (j = 0; j < i; j++) {
-		if (device->tx_cq[j])
-			ib_destroy_cq(device->tx_cq[j]);
-		if (device->rx_cq[j])
-			ib_destroy_cq(device->rx_cq[j]);
+	for (i = 0; i < device->cqs_used; i++) {
+		if (device->tx_cq[i])
+			ib_destroy_cq(device->tx_cq[i]);
+		if (device->rx_cq[i])
+			ib_destroy_cq(device->rx_cq[i]);
 	}
 	ib_dealloc_pd(device->pd);
 pd_err:

From 91eb1df39a1fba21bbc28895a84630782cd442ed Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Tue, 2 Sep 2014 17:08:42 +0300
Subject: [PATCH 13/26] IB/iser: Allow bind only when connection state is UP

We need to fail the bind operation if the iser connection state != UP
(started teardown) and this should be done under the state lock.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/ulp/iser/iscsi_iser.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index 61ee91d88380..93ce62fe1594 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -344,7 +344,6 @@ iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session,
 		     int is_leading)
 {
 	struct iscsi_conn *conn = cls_conn->dd_data;
-	struct iscsi_session *session;
 	struct iser_conn *ib_conn;
 	struct iscsi_endpoint *ep;
 	int error;
@@ -363,9 +362,17 @@ iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session,
 	}
 	ib_conn = ep->dd_data;
 
-	session = conn->session;
-	if (iser_alloc_rx_descriptors(ib_conn, session))
-		return -ENOMEM;
+	mutex_lock(&ib_conn->state_mutex);
+	if (ib_conn->state != ISER_CONN_UP) {
+		error = -EINVAL;
+		iser_err("iser_conn %p state is %d, teardown started\n",
+			 ib_conn, ib_conn->state);
+		goto out;
+	}
+
+	error = iser_alloc_rx_descriptors(ib_conn, conn->session);
+	if (error)
+		goto out;
 
 	/* binds the iSER connection retrieved from the previously
 	 * connected ep_handle to the iSCSI layer connection. exchanges
@@ -375,7 +382,9 @@ iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session,
 	conn->dd_data = ib_conn;
 	ib_conn->iscsi_conn = conn;
 
-	return 0;
+out:
+	mutex_unlock(&ib_conn->state_mutex);
+	return error;
 }
 
 static int

From 61aabb3c91c1b03478ffc1a4a2573f825e7f35f9 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Tue, 2 Sep 2014 17:08:43 +0300
Subject: [PATCH 14/26] IB/iser: Bump version to 1.4.1

Signed-off-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/ulp/iser/iscsi_iser.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index c877dad381cb..9f0e0e34d6ca 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -69,7 +69,7 @@
 
 #define DRV_NAME	"iser"
 #define PFX		DRV_NAME ": "
-#define DRV_VER		"1.4"
+#define DRV_VER		"1.4.1"
 
 #define iser_dbg(fmt, arg...)				\
 	do {						\

From e381835cf1b8e3b2857277dbc3b77d8c5350f70a Mon Sep 17 00:00:00 2001
From: Moni Shoua <monis@mellanox.com>
Date: Thu, 21 Aug 2014 14:28:37 +0300
Subject: [PATCH 15/26] IB/mlx4: Avoid null pointer dereference in
 mlx4_ib_scan_netdevs()

When Ethernet netdev is not present for a port (e.g. when the link
layer type of the port is InfiniBand) it's possible to dereference a
null pointer when we do netdevice scanning.

To fix that, we move a section of code that needs to run only when
netdev is present to a proper if () statement.

Fixes: ad4885d279b6 ("IB/mlx4: Build the port IBoE GID table properly under bonding")
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Moni Shoua <monis@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/hw/mlx4/main.c | 49 ++++++++++++++++---------------
 1 file changed, 26 insertions(+), 23 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 20f731f08c7e..bf09d79afd4d 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1789,32 +1789,35 @@ static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev,
 			port_state = (netif_running(curr_netdev) && netif_carrier_ok(curr_netdev)) ?
 						IB_PORT_ACTIVE : IB_PORT_DOWN;
 			mlx4_ib_set_default_gid(ibdev, curr_netdev, port);
+			/* if using bonding/team and a slave port is down, we
+			 * don't the bond IP based gids in the table since
+			 * flows that select port by gid may get the down port.
+			 */
+			if (curr_master && (port_state == IB_PORT_DOWN)) {
+				reset_gid_table(ibdev, port);
+				mlx4_ib_set_default_gid(ibdev,
+							curr_netdev, port);
+			}
+			/* if bonding is used it is possible that we add it to
+			 * masters only after IP address is assigned to the
+			 * net bonding interface.
+			*/
+			if (curr_master && (old_master != curr_master)) {
+				reset_gid_table(ibdev, port);
+				mlx4_ib_set_default_gid(ibdev,
+							curr_netdev, port);
+				mlx4_ib_get_dev_addr(curr_master, ibdev, port);
+			}
+
+			if (!curr_master && (old_master != curr_master)) {
+				reset_gid_table(ibdev, port);
+				mlx4_ib_set_default_gid(ibdev,
+							curr_netdev, port);
+				mlx4_ib_get_dev_addr(curr_netdev, ibdev, port);
+			}
 		} else {
 			reset_gid_table(ibdev, port);
 		}
-		/* if using bonding/team and a slave port is down, we don't the bond IP
-		 * based gids in the table since flows that select port by gid may get
-		 * the down port.
-		 */
-		if (curr_master && (port_state == IB_PORT_DOWN)) {
-			reset_gid_table(ibdev, port);
-			mlx4_ib_set_default_gid(ibdev, curr_netdev, port);
-		}
-		/* if bonding is used it is possible that we add it to masters
-		 * only after IP address is assigned to the net bonding
-		 * interface.
-		*/
-		if (curr_master && (old_master != curr_master)) {
-			reset_gid_table(ibdev, port);
-			mlx4_ib_set_default_gid(ibdev, curr_netdev, port);
-			mlx4_ib_get_dev_addr(curr_master, ibdev, port);
-		}
-
-		if (!curr_master && (old_master != curr_master)) {
-			reset_gid_table(ibdev, port);
-			mlx4_ib_set_default_gid(ibdev, curr_netdev, port);
-			mlx4_ib_get_dev_addr(curr_netdev, ibdev, port);
-		}
 	}
 
 	spin_unlock(&iboe->lock);

From f5c4834d9328c4ed9fe5dcbec6128d6da16db69a Mon Sep 17 00:00:00 2001
From: Moni Shoua <monis@mellanox.com>
Date: Thu, 21 Aug 2014 14:28:38 +0300
Subject: [PATCH 16/26] IB/mlx4: Don't duplicate the default RoCE GID

When reading the IPv6 addresses from the net-device, make sure to
avoid adding a duplicate entry to the GID table because of equality
between the default GID we generate and the default IPv6 link-local
address of the device.

Fixes: acc4fccf4eff ("IB/mlx4: Make sure GID index 0 is always occupied")
Signed-off-by: Moni Shoua <monis@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/hw/mlx4/main.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index bf09d79afd4d..16fb2327813a 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1679,6 +1679,7 @@ static void mlx4_ib_get_dev_addr(struct net_device *dev,
 	struct inet6_dev *in6_dev;
 	union ib_gid  *pgid;
 	struct inet6_ifaddr *ifp;
+	union ib_gid default_gid;
 #endif
 	union ib_gid gid;
 
@@ -1699,12 +1700,15 @@ static void mlx4_ib_get_dev_addr(struct net_device *dev,
 		in_dev_put(in_dev);
 	}
 #if IS_ENABLED(CONFIG_IPV6)
+	mlx4_make_default_gid(dev, &default_gid);
 	/* IPv6 gids */
 	in6_dev = in6_dev_get(dev);
 	if (in6_dev) {
 		read_lock_bh(&in6_dev->lock);
 		list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
 			pgid = (union ib_gid *)&ifp->addr;
+			if (!memcmp(pgid, &default_gid, sizeof(*pgid)))
+				continue;
 			update_gid_table(ibdev, port, pgid, 0, 0);
 		}
 		read_unlock_bh(&in6_dev->lock);

From 655b2aaefc353604f9975c31960d9722e6eda449 Mon Sep 17 00:00:00 2001
From: Moni Shoua <monis@mellanox.com>
Date: Thu, 21 Aug 2014 14:28:39 +0300
Subject: [PATCH 17/26] IB/mlx4: Reorder steps in RoCE GID table initialization

There's no need to reset the gid table twice and we need to do it only
for Ethernet ports. Also, no need to actively scan ndetdevs since it's
being done immediatly after we register netdev notifiers.

Signed-off-by: Moni Shoua <monis@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/hw/mlx4/main.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 16fb2327813a..49965b692042 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1730,24 +1730,33 @@ static int mlx4_ib_init_gid_table(struct mlx4_ib_dev *ibdev)
 	struct	net_device *dev;
 	struct mlx4_ib_iboe *iboe = &ibdev->iboe;
 	int i;
+	int err = 0;
 
-	for (i = 1; i <= ibdev->num_ports; ++i)
-		if (reset_gid_table(ibdev, i))
-			return -1;
+	for (i = 1; i <= ibdev->num_ports; ++i) {
+		if (rdma_port_get_link_layer(&ibdev->ib_dev, i) ==
+		    IB_LINK_LAYER_ETHERNET) {
+			err = reset_gid_table(ibdev, i);
+			if (err)
+				goto out;
+		}
+	}
 
 	read_lock(&dev_base_lock);
 	spin_lock(&iboe->lock);
 
 	for_each_netdev(&init_net, dev) {
 		u8 port = mlx4_ib_get_dev_port(dev, ibdev);
-		if (port)
+		/* port will be non-zero only for ETH ports */
+		if (port) {
+			mlx4_ib_set_default_gid(ibdev, dev, port);
 			mlx4_ib_get_dev_addr(dev, ibdev, port);
+		}
 	}
 
 	spin_unlock(&iboe->lock);
 	read_unlock(&dev_base_lock);
-
-	return 0;
+out:
+	return err;
 }
 
 static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev,
@@ -2202,12 +2211,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 			}
 		}
 #endif
-		for (i = 1 ; i <= ibdev->num_ports ; ++i)
-			reset_gid_table(ibdev, i);
-		rtnl_lock();
-		mlx4_ib_scan_netdevs(ibdev, NULL, 0);
-		rtnl_unlock();
-		mlx4_ib_init_gid_table(ibdev);
+		if (mlx4_ib_init_gid_table(ibdev))
+			goto err_notif;
 	}
 
 	for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) {

From bccb84f1dfab92ed180adf09c76cfa9ddc90edb9 Mon Sep 17 00:00:00 2001
From: Moni Shoua <monis@mellanox.com>
Date: Thu, 21 Aug 2014 14:28:40 +0300
Subject: [PATCH 18/26] IB/mlx4: Get upper dev addresses as RoCE GIDs when port
 comes up

When a RoCE port becomes active and the netdev of the port has upper
device (e.g bond/team), GIDs derived from the upper dev should appear
in the port's RoCE GID table.

Signed-off-by: Moni Shoua <monis@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/hw/mlx4/main.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 49965b692042..d404a2eafa79 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1802,14 +1802,23 @@ static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev,
 			port_state = (netif_running(curr_netdev) && netif_carrier_ok(curr_netdev)) ?
 						IB_PORT_ACTIVE : IB_PORT_DOWN;
 			mlx4_ib_set_default_gid(ibdev, curr_netdev, port);
-			/* if using bonding/team and a slave port is down, we
-			 * don't the bond IP based gids in the table since
-			 * flows that select port by gid may get the down port.
-			 */
-			if (curr_master && (port_state == IB_PORT_DOWN)) {
-				reset_gid_table(ibdev, port);
-				mlx4_ib_set_default_gid(ibdev,
-							curr_netdev, port);
+			if (curr_master) {
+				/* if using bonding/team and a slave port is down, we
+				 * don't want the bond IP based gids in the table since
+				 * flows that select port by gid may get the down port.
+				*/
+				if (port_state == IB_PORT_DOWN) {
+					reset_gid_table(ibdev, port);
+					mlx4_ib_set_default_gid(ibdev,
+								curr_netdev,
+								port);
+				} else {
+					/* gids from the upper dev (bond/team)
+					 * should appear in port's gid table
+					*/
+					mlx4_ib_get_dev_addr(curr_master,
+							     ibdev, port);
+				}
 			}
 			/* if bonding is used it is possible that we add it to
 			 * masters only after IP address is assigned to the

From dba3ad2addcd74ec850e510f3b8a9d046cc24ef3 Mon Sep 17 00:00:00 2001
From: Jack Morgenstein <jackm@dev.mellanox.co.il>
Date: Thu, 21 Aug 2014 14:28:41 +0300
Subject: [PATCH 19/26] IB/mlx4: Fix lockdep splat for the iboe lock

Chuck Lever reported the following stack trace:

    =================================
    [ INFO: inconsistent lock state ]
    3.16.0-rc2-00024-g2e78883 #17 Tainted: G            E
    ---------------------------------
    inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
    swapper/0/0 [HC0[0]:SC1[1]:HE1:SE0] takes:
    (&(&iboe->lock)->rlock){+.?...}, at: [<ffffffffa065f68b>] mlx4_ib_addr_event+0xdb/0x1a0 [mlx4_ib]
    {SOFTIRQ-ON-W} state was registered at:
     [<ffffffff810b3110>] mark_irqflags+0x110/0x170
     [<ffffffff810b4806>] __lock_acquire+0x2c6/0x5b0
     [<ffffffff810b4bd9>] lock_acquire+0xe9/0x120
     [<ffffffff815f7f6e>] _raw_spin_lock+0x3e/0x80
     [<ffffffffa0661084>] mlx4_ib_scan_netdevs+0x34/0x260 [mlx4_ib]
     [<ffffffffa06612db>] mlx4_ib_netdev_event+0x2b/0x40 [mlx4_ib]
     [<ffffffff81522219>] register_netdevice_notifier+0x99/0x1e0
     [<ffffffffa06626e3>] mlx4_ib_add+0x743/0xbc0 [mlx4_ib]
     [<ffffffffa05ec168>] mlx4_add_device+0x48/0xa0 [mlx4_core]
     [<ffffffffa05ec2c3>] mlx4_register_interface+0x73/0xb0 [mlx4_core]
     [<ffffffffa05c505e>] cm_req_handler+0x13e/0x460 [ib_cm]
     [<ffffffff810002e2>] do_one_initcall+0x112/0x1c0
     [<ffffffff810e8264>] do_init_module+0x34/0x190
     [<ffffffff810ea62f>] load_module+0x5cf/0x740
     [<ffffffff810ea939>] SyS_init_module+0x99/0xd0
     [<ffffffff815f8fd2>] system_call_fastpath+0x16/0x1b
    irq event stamp: 336142
    hardirqs last  enabled at (336142): [<ffffffff810612f5>] __local_bh_enable_ip+0xb5/0xc0
    hardirqs last disabled at (336141): [<ffffffff81061296>] __local_bh_enable_ip+0x56/0xc0
    softirqs last  enabled at (336004): [<ffffffff8106123a>] _local_bh_enable+0x4a/0x50
    softirqs last disabled at (336005): [<ffffffff810617a4>] irq_exit+0x44/0xd0

    other info that might help us debug this:
    Possible unsafe locking scenario:

          CPU0
          ----
     lock(&(&iboe->lock)->rlock);
     <Interrupt>
       lock(&(&iboe->lock)->rlock);

    *** DEADLOCK ***

The above problem was caused by the spin lock being taken both in the process
context and in a soft-irq context (in a netdev notifier handler).

The required fix is to use spin_lock/unlock_bh() instead of spin_lock/unlock
on the iboe lock.

Reported-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/hw/mlx4/main.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index d404a2eafa79..c231112396b2 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -360,7 +360,7 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port,
 	props->state		= IB_PORT_DOWN;
 	props->phys_state	= state_to_phys_state(props->state);
 	props->active_mtu	= IB_MTU_256;
-	spin_lock(&iboe->lock);
+	spin_lock_bh(&iboe->lock);
 	ndev = iboe->netdevs[port - 1];
 	if (!ndev)
 		goto out_unlock;
@@ -372,7 +372,7 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port,
 					IB_PORT_ACTIVE : IB_PORT_DOWN;
 	props->phys_state	= state_to_phys_state(props->state);
 out_unlock:
-	spin_unlock(&iboe->lock);
+	spin_unlock_bh(&iboe->lock);
 out:
 	mlx4_free_cmd_mailbox(mdev->dev, mailbox);
 	return err;
@@ -814,11 +814,11 @@ int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
 	if (!mqp->port)
 		return 0;
 
-	spin_lock(&mdev->iboe.lock);
+	spin_lock_bh(&mdev->iboe.lock);
 	ndev = mdev->iboe.netdevs[mqp->port - 1];
 	if (ndev)
 		dev_hold(ndev);
-	spin_unlock(&mdev->iboe.lock);
+	spin_unlock_bh(&mdev->iboe.lock);
 
 	if (ndev) {
 		ret = 1;
@@ -1265,11 +1265,11 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
 	mutex_lock(&mqp->mutex);
 	ge = find_gid_entry(mqp, gid->raw);
 	if (ge) {
-		spin_lock(&mdev->iboe.lock);
+		spin_lock_bh(&mdev->iboe.lock);
 		ndev = ge->added ? mdev->iboe.netdevs[ge->port - 1] : NULL;
 		if (ndev)
 			dev_hold(ndev);
-		spin_unlock(&mdev->iboe.lock);
+		spin_unlock_bh(&mdev->iboe.lock);
 		if (ndev)
 			dev_put(ndev);
 		list_del(&ge->list);
@@ -1554,7 +1554,7 @@ static int mlx4_ib_addr_event(int event, struct net_device *event_netdev,
 		return 0;
 
 	iboe = &ibdev->iboe;
-	spin_lock(&iboe->lock);
+	spin_lock_bh(&iboe->lock);
 
 	for (port = 1; port <= ibdev->dev->caps.num_ports; ++port)
 		if ((netif_is_bond_master(real_dev) &&
@@ -1564,7 +1564,7 @@ static int mlx4_ib_addr_event(int event, struct net_device *event_netdev,
 			update_gid_table(ibdev, port, gid,
 					 event == NETDEV_DOWN, 0);
 
-	spin_unlock(&iboe->lock);
+	spin_unlock_bh(&iboe->lock);
 	return 0;
 
 }
@@ -1742,7 +1742,7 @@ static int mlx4_ib_init_gid_table(struct mlx4_ib_dev *ibdev)
 	}
 
 	read_lock(&dev_base_lock);
-	spin_lock(&iboe->lock);
+	spin_lock_bh(&iboe->lock);
 
 	for_each_netdev(&init_net, dev) {
 		u8 port = mlx4_ib_get_dev_port(dev, ibdev);
@@ -1753,7 +1753,7 @@ static int mlx4_ib_init_gid_table(struct mlx4_ib_dev *ibdev)
 		}
 	}
 
-	spin_unlock(&iboe->lock);
+	spin_unlock_bh(&iboe->lock);
 	read_unlock(&dev_base_lock);
 out:
 	return err;
@@ -1770,7 +1770,7 @@ static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev,
 
 	iboe = &ibdev->iboe;
 
-	spin_lock(&iboe->lock);
+	spin_lock_bh(&iboe->lock);
 	mlx4_foreach_ib_transport_port(port, ibdev->dev) {
 		enum ib_port_state	port_state = IB_PORT_NOP;
 		struct net_device *old_master = iboe->masters[port - 1];
@@ -1842,7 +1842,7 @@ static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev,
 		}
 	}
 
-	spin_unlock(&iboe->lock);
+	spin_unlock_bh(&iboe->lock);
 
 	if (update_qps_port > 0)
 		mlx4_ib_update_qps(ibdev, dev, update_qps_port);

From 4bf9715f184969dc703bde7be94919995024a6a9 Mon Sep 17 00:00:00 2001
From: Moni Shoua <monis@mellanox.com>
Date: Thu, 21 Aug 2014 14:28:42 +0300
Subject: [PATCH 20/26] IB/mlx4: Avoid executing gid task when device is being
 removed

When device is being removed (e.g during VPI port link type change
from ETH to IB), tasks for gid table changes should not be executed.

Flush the current queue of tasks and block further tasks from entering the queue.

Signed-off-by: Moni Shoua <monis@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/hw/mlx4/main.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index c231112396b2..c61bcee9ee8a 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1390,6 +1390,9 @@ static void update_gids_task(struct work_struct *work)
 	int err;
 	struct mlx4_dev	*dev = gw->dev->dev;
 
+	if (!gw->dev->ib_active)
+		return;
+
 	mailbox = mlx4_alloc_cmd_mailbox(dev);
 	if (IS_ERR(mailbox)) {
 		pr_warn("update gid table failed %ld\n", PTR_ERR(mailbox));
@@ -1420,6 +1423,9 @@ static void reset_gids_task(struct work_struct *work)
 	int err;
 	struct mlx4_dev	*dev = gw->dev->dev;
 
+	if (!gw->dev->ib_active)
+		return;
+
 	mailbox = mlx4_alloc_cmd_mailbox(dev);
 	if (IS_ERR(mailbox)) {
 		pr_warn("reset gid table failed\n");
@@ -2369,6 +2375,9 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
 	struct mlx4_ib_dev *ibdev = ibdev_ptr;
 	int p;
 
+	ibdev->ib_active = false;
+	flush_workqueue(wq);
+
 	mlx4_ib_close_sriov(ibdev);
 	mlx4_ib_mad_cleanup(ibdev);
 	ib_unregister_device(&ibdev->ib_dev);

From a59c5850f09b4c2d6ad2fc47e5e1be8d654529d6 Mon Sep 17 00:00:00 2001
From: Matan Barak <matanb@mellanox.com>
Date: Tue, 2 Sep 2014 15:32:34 +0300
Subject: [PATCH 21/26] IB/core: When marshaling uverbs path, clear unused
 fields

When marsheling a user path to the kernel struct ib_sa_path, need
to zero smac, dmac and set the vlan id to the "no vlan" value.

Fixes: dd5f03beb4f7 ("IB/core: Ethernet L2 attributes in verbs/cm structures")
Reported-by: Aleksey Senin <alekseys@mellanox.com>
Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/core/uverbs_marshall.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c
index e7bee46868d1..abd97247443e 100644
--- a/drivers/infiniband/core/uverbs_marshall.c
+++ b/drivers/infiniband/core/uverbs_marshall.c
@@ -140,5 +140,9 @@ void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst,
 	dst->packet_life_time	= src->packet_life_time;
 	dst->preference		= src->preference;
 	dst->packet_life_time_selector = src->packet_life_time_selector;
+
+	memset(dst->smac, 0, sizeof(dst->smac));
+	memset(dst->dmac, 0, sizeof(dst->dmac));
+	dst->vlan_id = 0xffff;
 }
 EXPORT_SYMBOL(ib_copy_path_rec_from_user);

From f4fd40b26bd597e203639281859a758402550d62 Mon Sep 17 00:00:00 2001
From: Jack Morgenstein <jackm@dev.mellanox.co.il>
Date: Thu, 11 Sep 2014 14:11:16 +0300
Subject: [PATCH 22/26] mlx4: Fix mlx4 reg/unreg mac to work properly with
 0-mac addresses

There is a chance that the VF mlx4 RoCE driver (mlx4_ib) may see a 0-mac
as the current default MAC address when a RoCE interface first comes up.

In this case, the RoCE driver registers the 0-mac to get its MAC index --
used in the INIT2RTR transition when it creates its proxy Q1 qp's.

If we do not allow QP1 to be created, the RoCE driver will not come up.
If we do not register the 0-mac, but simply use a random mac-index,
QP1 will attempt to send packets with an someone's else source MAC which
will get the system into more troubled.

Since a 0-mac was previously used to indicate a free slot, this leads to
errors, both when the 0-mac is registered and when it is unregistered.

The required fix is to check in addition that the slot containing the
0-mac has a reference count of zero.

Additionally, when comparing MAC addresses, need to mask out the 2 MSBs
of the u64 mac on both sides of the comparison.

Note that when the EN driver (mlx4_en) comes up, it set itself a proper
mac --> the RoCE driver gets to be notified on that and further handing
is done with the update qp command, as was added by commit 9433c188915c
("IB/mlx4: Invoke UPDATE_QP for proxy QP1 on MAC changes").

Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/net/ethernet/mellanox/mlx4/port.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c
index 9ba0c1ca10d5..94eeb2c7d7e4 100644
--- a/drivers/net/ethernet/mellanox/mlx4/port.c
+++ b/drivers/net/ethernet/mellanox/mlx4/port.c
@@ -103,7 +103,8 @@ static int find_index(struct mlx4_dev *dev,
 	int i;
 
 	for (i = 0; i < MLX4_MAX_MAC_NUM; i++) {
-		if ((mac & MLX4_MAC_MASK) ==
+		if (table->refs[i] &&
+		    (MLX4_MAC_MASK & mac) ==
 		    (MLX4_MAC_MASK & be64_to_cpu(table->entries[i])))
 			return i;
 	}
@@ -165,12 +166,14 @@ int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac)
 
 	mutex_lock(&table->mutex);
 	for (i = 0; i < MLX4_MAX_MAC_NUM; i++) {
-		if (free < 0 && !table->entries[i]) {
-			free = i;
+		if (!table->refs[i]) {
+			if (free < 0)
+				free = i;
 			continue;
 		}
 
-		if (mac == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) {
+		if ((MLX4_MAC_MASK & mac) ==
+		     (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) {
 			/* MAC already registered, increment ref count */
 			err = i;
 			++table->refs[i];

From 3e0629cb6c0518423c9e2671bbe8ec15dde5dcaf Mon Sep 17 00:00:00 2001
From: Jack Morgenstein <jackm@dev.mellanox.co.il>
Date: Thu, 11 Sep 2014 14:11:17 +0300
Subject: [PATCH 23/26] IB/mlx4: Avoid accessing netdevice when building RoCE
 qp1 header

The source MAC is needed in RoCE when building the QP1 header.

Currently, this is obtained from the source net device. However, the net
device may not yet exist, or can be destroyed in parallel to this QP1 send
operation (e.g through the VPI port change flow) so accessing it may cause
a kernel crash.

To fix this, we maintain a source MAC cache per port for the net device in
struct mlx4_ib_roce.  This cached MAC is initialized to be the default MAC
address obtained during HCA initialization via QUERY_PORT. This cached MAC
is updated via the netdev event notifier handler.

Since the cached MAC is held in an atomic64 object, we do not need locking
when accessing it.

Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/hw/mlx4/main.c    |  5 ++++
 drivers/infiniband/hw/mlx4/mlx4_ib.h |  1 +
 drivers/infiniband/hw/mlx4/qp.c      | 38 ++++++++++++++++------------
 3 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index c61bcee9ee8a..657ce0f810fd 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1643,6 +1643,8 @@ static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev,
 	new_smac = mlx4_mac_to_u64(dev->dev_addr);
 	read_unlock(&dev_base_lock);
 
+	atomic64_set(&ibdev->iboe.mac[port - 1], new_smac);
+
 	mutex_lock(&ibdev->qp1_proxy_lock[port - 1]);
 	qp = ibdev->qp1_proxy[port - 1];
 	if (qp) {
@@ -2190,6 +2192,9 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 			goto err_steer_free_bitmap;
 	}
 
+	for (j = 1; j <= ibdev->dev->caps.num_ports; j++)
+		atomic64_set(&iboe->mac[j - 1], ibdev->dev->caps.def_mac[j]);
+
 	if (ib_register_device(&ibdev->ib_dev, NULL))
 		goto err_steer_free_bitmap;
 
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index e8cad3926bfc..6eb743f65f6f 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -451,6 +451,7 @@ struct mlx4_ib_iboe {
 	spinlock_t		lock;
 	struct net_device      *netdevs[MLX4_MAX_PORTS];
 	struct net_device      *masters[MLX4_MAX_PORTS];
+	atomic64_t		mac[MLX4_MAX_PORTS];
 	struct notifier_block 	nb;
 	struct notifier_block	nb_inet;
 	struct notifier_block	nb_inet6;
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 67780452f0cf..25e0208588e6 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -1390,18 +1390,10 @@ static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
 static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, u8 *smac,
 				    struct mlx4_qp_context *context)
 {
-	struct net_device *ndev;
 	u64 u64_mac;
 	int smac_index;
 
-
-	ndev = dev->iboe.netdevs[qp->port - 1];
-	if (ndev) {
-		smac = ndev->dev_addr;
-		u64_mac = mlx4_mac_to_u64(smac);
-	} else {
-		u64_mac = dev->dev->caps.def_mac[qp->port];
-	}
+	u64_mac = atomic64_read(&dev->iboe.mac[qp->port - 1]);
 
 	context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((qp->port - 1) << 6);
 	if (!qp->pri.smac) {
@@ -2083,6 +2075,16 @@ static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp,
 	return 0;
 }
 
+static void mlx4_u64_to_smac(u8 *dst_mac, u64 src_mac)
+{
+	int i;
+
+	for (i = ETH_ALEN; i; i--) {
+		dst_mac[i - 1] = src_mac & 0xff;
+		src_mac >>= 8;
+	}
+}
+
 static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
 			    void *wqe, unsigned *mlx_seg_len)
 {
@@ -2197,7 +2199,6 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
 	}
 
 	if (is_eth) {
-		u8 *smac;
 		struct in6_addr in6;
 
 		u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13;
@@ -2210,12 +2211,17 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
 		memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4);
 		memcpy(&in6, sgid.raw, sizeof(in6));
 
-		if (!mlx4_is_mfunc(to_mdev(ib_dev)->dev))
-			smac = to_mdev(sqp->qp.ibqp.device)->
-				iboe.netdevs[sqp->qp.port - 1]->dev_addr;
-		else	/* use the src mac of the tunnel */
-			smac = ah->av.eth.s_mac;
-		memcpy(sqp->ud_header.eth.smac_h, smac, 6);
+		if (!mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
+			u64 mac = atomic64_read(&to_mdev(ib_dev)->iboe.mac[sqp->qp.port - 1]);
+			u8 smac[ETH_ALEN];
+
+			mlx4_u64_to_smac(smac, mac);
+			memcpy(sqp->ud_header.eth.smac_h, smac, ETH_ALEN);
+		} else {
+			/* use the src mac of the tunnel */
+			memcpy(sqp->ud_header.eth.smac_h, ah->av.eth.s_mac, ETH_ALEN);
+		}
+
 		if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6))
 			mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK);
 		if (!is_vlan) {

From d24d9f43384b5933867a5786934e130efa8b5c92 Mon Sep 17 00:00:00 2001
From: Jack Morgenstein <jackm@dev.mellanox.co.il>
Date: Thu, 11 Sep 2014 14:11:18 +0300
Subject: [PATCH 24/26] IB/mlx4: Don't update QP1 in native mode

For native functions (non-SR-IOV), there's no reason to update
the smac_index, as QP1 is a GSI QP.

Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/hw/mlx4/main.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 657ce0f810fd..6ad7f7a0e464 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1645,6 +1645,10 @@ static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev,
 
 	atomic64_set(&ibdev->iboe.mac[port - 1], new_smac);
 
+	/* no need for update QP1 and mac registration in non-SRIOV */
+	if (!mlx4_is_mfunc(ibdev->dev))
+		return;
+
 	mutex_lock(&ibdev->qp1_proxy_lock[port - 1]);
 	qp = ibdev->qp1_proxy[port - 1];
 	if (qp) {

From 3dec48788817fce4a029cbab14f2b407ae78478f Mon Sep 17 00:00:00 2001
From: Jack Morgenstein <jackm@dev.mellanox.co.il>
Date: Thu, 11 Sep 2014 14:11:19 +0300
Subject: [PATCH 25/26] IB/mlx4: Do not allow APM under RoCE

Automatic Path Migration is not supported under RoCE. Therefore,
return a "not-supported" error if the caller attempts to set an
alternate path in a QP context.

In addition, if there are no IB ports configured, do not report
APM capability in the device flags returned by mlx4_ib_query_device.

Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/hw/mlx4/main.c | 16 +++++++++++++++-
 drivers/infiniband/hw/mlx4/qp.c   |  6 ++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 6ad7f7a0e464..d57563b9d1fb 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -120,6 +120,17 @@ static int check_flow_steering_support(struct mlx4_dev *dev)
 	return dmfs;
 }
 
+static int num_ib_ports(struct mlx4_dev *dev)
+{
+	int ib_ports = 0;
+	int i;
+
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
+		ib_ports++;
+
+	return ib_ports;
+}
+
 static int mlx4_ib_query_device(struct ib_device *ibdev,
 				struct ib_device_attr *props)
 {
@@ -127,6 +138,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
 	struct ib_smp *in_mad  = NULL;
 	struct ib_smp *out_mad = NULL;
 	int err = -ENOMEM;
+	int have_ib_ports;
 
 	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
 	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
@@ -143,6 +155,8 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
 
 	memset(props, 0, sizeof *props);
 
+	have_ib_ports = num_ib_ports(dev->dev);
+
 	props->fw_ver = dev->dev->caps.fw_ver;
 	props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
 		IB_DEVICE_PORT_ACTIVE_EVENT		|
@@ -153,7 +167,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
 		props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
 	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR)
 		props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
-	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM)
+	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM && have_ib_ports)
 		props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
 	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UD_AV_PORT)
 		props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE;
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 25e0208588e6..393423c69c00 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -1424,6 +1424,12 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 	int steer_qp = 0;
 	int err = -EINVAL;
 
+	/* APM is not supported under RoCE */
+	if (attr_mask & IB_QP_ALT_PATH &&
+	    rdma_port_get_link_layer(&dev->ib_dev, qp->port) ==
+	    IB_LINK_LAYER_ETHERNET)
+		return -ENOTSUPP;
+
 	context = kzalloc(sizeof *context, GFP_KERNEL);
 	if (!context)
 		return -ENOMEM;

From 25476b0209b2e48dfb689e1b4cf7278875082b1f Mon Sep 17 00:00:00 2001
From: Jack Morgenstein <jackm@dev.mellanox.co.il>
Date: Thu, 11 Sep 2014 14:11:20 +0300
Subject: [PATCH 26/26] IB/mlx4: Fix VF mac handling in RoCE

We had several problems here.  First, a race condition on QP1 mac
handling between mlx4_ib_update_qps and mlx4_ib_modify_qp, which is
fixed by taking the qp mutex in mlx4_ib_update_qps.

Also, qp->pri.smac_port was not updated in mlx4_ib_update_qps.

Last, in __mlx4_ib_modify_qp we did not properly handle the case where
the mac is zero, but port is non-zero.

Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/hw/mlx4/main.c | 15 ++++++++++-----
 drivers/infiniband/hw/mlx4/qp.c   | 16 ++++++++++------
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index d57563b9d1fb..c7586a1cec30 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1667,9 +1667,11 @@ static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev,
 	qp = ibdev->qp1_proxy[port - 1];
 	if (qp) {
 		int new_smac_index;
-		u64 old_smac = qp->pri.smac;
+		u64 old_smac;
 		struct mlx4_update_qp_params update_params;
 
+		mutex_lock(&qp->mutex);
+		old_smac = qp->pri.smac;
 		if (new_smac == old_smac)
 			goto unlock;
 
@@ -1684,17 +1686,20 @@ static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev,
 			release_mac = new_smac;
 			goto unlock;
 		}
-
+		/* if old port was zero, no mac was yet registered for this QP */
+		if (qp->pri.smac_port)
+			release_mac = old_smac;
 		qp->pri.smac = new_smac;
+		qp->pri.smac_port = port;
 		qp->pri.smac_index = new_smac_index;
-
-		release_mac = old_smac;
 	}
 
 unlock:
-	mutex_unlock(&ibdev->qp1_proxy_lock[port - 1]);
 	if (release_mac != MLX4_IB_INVALID_MAC)
 		mlx4_unregister_mac(ibdev->dev, port, release_mac);
+	if (qp)
+		mutex_unlock(&qp->mutex);
+	mutex_unlock(&ibdev->qp1_proxy_lock[port - 1]);
 }
 
 static void mlx4_ib_get_dev_addr(struct net_device *dev,
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 393423c69c00..577b477cfaa6 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -964,9 +964,10 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
 				   MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp))
 			pr_warn("modify QP %06x to RESET failed.\n",
 			       qp->mqp.qpn);
-		if (qp->pri.smac) {
+		if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) {
 			mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
 			qp->pri.smac = 0;
+			qp->pri.smac_port = 0;
 		}
 		if (qp->alt.smac) {
 			mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
@@ -1325,7 +1326,8 @@ static int _mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
 		 * If one was already assigned, but the new mac differs,
 		 * unregister the old one and register the new one.
 		*/
-		if (!smac_info->smac || smac_info->smac != smac) {
+		if ((!smac_info->smac && !smac_info->smac_port) ||
+		    smac_info->smac != smac) {
 			/* register candidate now, unreg if needed, after success */
 			smac_index = mlx4_register_mac(dev->dev, port, smac);
 			if (smac_index >= 0) {
@@ -1396,7 +1398,7 @@ static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *
 	u64_mac = atomic64_read(&dev->iboe.mac[qp->port - 1]);
 
 	context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((qp->port - 1) << 6);
-	if (!qp->pri.smac) {
+	if (!qp->pri.smac && !qp->pri.smac_port) {
 		smac_index = mlx4_register_mac(dev->dev, qp->port, u64_mac);
 		if (smac_index >= 0) {
 			qp->pri.candidate_smac_index = smac_index;
@@ -1778,9 +1780,10 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 			if (qp->flags & MLX4_IB_QP_NETIF)
 				mlx4_ib_steer_qp_reg(dev, qp, 0);
 		}
-		if (qp->pri.smac) {
+		if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) {
 			mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
 			qp->pri.smac = 0;
+			qp->pri.smac_port = 0;
 		}
 		if (qp->alt.smac) {
 			mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
@@ -1804,11 +1807,12 @@ out:
 	if (err && steer_qp)
 		mlx4_ib_steer_qp_reg(dev, qp, 0);
 	kfree(context);
-	if (qp->pri.candidate_smac) {
+	if (qp->pri.candidate_smac ||
+	    (!qp->pri.candidate_smac && qp->pri.candidate_smac_port)) {
 		if (err) {
 			mlx4_unregister_mac(dev->dev, qp->pri.candidate_smac_port, qp->pri.candidate_smac);
 		} else {
-			if (qp->pri.smac)
+			if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port))
 				mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
 			qp->pri.smac = qp->pri.candidate_smac;
 			qp->pri.smac_index = qp->pri.candidate_smac_index;