From: Jason Gunthorpe <jgg@mellanox.com>
Date: Wed, 9 Oct 2019 13:09:23 -0300
Subject: RDMA/mlx5: Use a dedicated mkey xarray for ODP
Patch-mainline: v5.5-rc1
Git-commit: 806b101b2bfa800a9c779336b750bee39c7fb3b4
References: jsc#SLE-8446

There is a per-device xarray that stores every mkey in the system.
However, this xarray is now only read by ODP, and only for certain
ODP-designated MKeys (ODP MRs, implicit ODP MRs, MWs, DEVX_INDIRECT mkeys).

Create an xarray for use only by ODP, containing only ODP-related MKeys.
This xarray is protected by SRCU, and every erase is paired with a
synchronize_srcu() before the mkey can be freed.
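
The teardown discipline this implies looks like the following (a sketch
with a hypothetical helper name; the real flow is in dereg_mr() and
devx_obj_cleanup() below):

  static void odp_mkey_unpublish(struct mlx5_ib_dev *dev,
                                 struct mlx5_core_mkey *mmkey)
  {
          /* Hide the mkey from new SRCU readers */
          xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmkey->key));

          /*
           * Wait for page-fault handlers already inside the SRCU
           * read side; after this no reader can hold the mkey.
           */
          synchronize_srcu(&dev->odp_srcu);

          /* Now it is safe to destroy the HW object and free it */
          mlx5_core_destroy_mkey(dev->mdev, mmkey);
  }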

This improves performance:

 - All MRs in the odp_mkeys xarray are ODP MRs, so some tests for is_odp()
   can be deleted. The xarray will also consume fewer nodes.

 - Normal MRs are never mixed with ODP MRs in an SRCU data structure, so
   the performance-sucking synchronize_srcu() on every MR destruction is
   no longer needed.

 - No smp_load_acquire(live) plus xa_load() double barrier on the read
   path; a single xa_load() under SRCU suffices (see the reader-side
   sketch after this list).
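
As a reader-side sketch of that last point (hypothetical helper name;
the real lookup lives in pagefault_single_data_segment() below):

  static struct mlx5_core_mkey *
  odp_mkey_lookup(struct mlx5_ib_dev *dev, u32 key, int *srcu_key)
  {
          struct mlx5_core_mkey *mmkey;

          *srcu_key = srcu_read_lock(&dev->odp_srcu);
          /*
           * Anything found here is ODP-related by construction and
           * cannot be freed until the caller leaves the SRCU read side.
           */
          mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
          if (!mmkey)
                  srcu_read_unlock(&dev->odp_srcu, *srcu_key);
          return mmkey; /* caller unlocks when done with mmkey */
  }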

Due to the SRCU locking scheme, care must be taken with the placement of
the xa_store(). Once it completes, the MR is immediately visible to
other threads, and from then on it can only be destroyed through an
xa_erase() & synchronize_srcu() cycle.
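
Concretely (an annotated sketch of the ordering, mirroring the
mlx5_ib_reg_user_mr() hunk below), the xa_store() must be the last step
of setup:

  /* Finish every field a page-fault handler may touch ... */
  to_ib_umem_odp(mr->umem)->private = mr;
  atomic_set(&mr->num_pending_prefetch, 0);

  /* ... and only then publish the mkey to SRCU readers */
  err = xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
                        &mr->mmkey, GFP_KERNEL));
  if (err) {
          /* unpublishes via xa_erase() + synchronize_srcu() */
          dereg_mr(dev, mr);
          return ERR_PTR(err);
  }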

Link: https://lore.kernel.org/r/20191009160934.3143-4-jgg@ziepe.ca
Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Acked-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
---
 drivers/infiniband/hw/mlx5/devx.c    |    8 +--
 drivers/infiniband/hw/mlx5/main.c    |   17 +++----
 drivers/infiniband/hw/mlx5/mlx5_ib.h |    5 +-
 drivers/infiniband/hw/mlx5/mr.c      |   43 +++++++++---------
 drivers/infiniband/hw/mlx5/odp.c     |   83 ++++++++++++++++++-----------------
 5 files changed, 83 insertions(+), 73 deletions(-)

--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -1265,8 +1265,8 @@ static int devx_handle_mkey_indirect(str
 	mkey->pd = MLX5_GET(mkc, mkc, pd);
 	devx_mr->ndescs = MLX5_GET(mkc, mkc, translations_octword_size);
 
-	return xa_err(xa_store(&dev->mdev->priv.mkey_table,
-			       mlx5_base_mkey(mkey->key), mkey, GFP_KERNEL));
+	return xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(mkey->key), mkey,
+			       GFP_KERNEL));
 }
 
 static int devx_handle_mkey_create(struct mlx5_ib_dev *dev,
@@ -1345,9 +1345,9 @@ static int devx_obj_cleanup(struct ib_uo
 		 * the mmkey, we must wait for that to stop before freeing the
 		 * mkey, as another allocation could get the same mkey #.
 		 */
-		xa_erase(&obj->ib_dev->mdev->priv.mkey_table,
+		xa_erase(&obj->ib_dev->odp_mkeys,
 			 mlx5_base_mkey(obj->devx_mr.mmkey.key));
-		synchronize_srcu(&dev->mr_srcu);
+		synchronize_srcu(&dev->odp_srcu);
 	}
 
 	if (obj->flags & DEVX_OBJ_FLAGS_DCT)
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -6128,10 +6128,10 @@ static struct ib_counters *mlx5_ib_creat
 static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
 {
 	mlx5_ib_cleanup_multiport_master(dev);
-	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
-		srcu_barrier(&dev->mr_srcu);
-		cleanup_srcu_struct(&dev->mr_srcu);
-	}
+	WARN_ON(!xa_empty(&dev->odp_mkeys));
+	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+		srcu_barrier(&dev->odp_srcu);
+	cleanup_srcu_struct(&dev->odp_srcu);
 
 	WARN_ON(!xa_empty(&dev->sig_mrs));
 	WARN_ON(!bitmap_empty(dev->dm.memic_alloc_pages, MLX5_MAX_MEMIC_PAGES));
@@ -6185,16 +6185,15 @@ static int mlx5_ib_stage_init_init(struc
 	mutex_init(&dev->cap_mask_mutex);
 	INIT_LIST_HEAD(&dev->qp_list);
 	spin_lock_init(&dev->reset_flow_resource_lock);
+	xa_init(&dev->odp_mkeys);
 	xa_init(&dev->sig_mrs);
 
 	spin_lock_init(&dev->dm.lock);
 	dev->dm.dev = mdev;
 
-	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
-		err = init_srcu_struct(&dev->mr_srcu);
-		if (err)
-			goto err_mp;
-	}
+	err = init_srcu_struct(&dev->odp_srcu);
+	if (err)
+		goto err_mp;
 
 	return 0;
 
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -604,7 +604,6 @@ struct mlx5_ib_mr {
 	struct mlx5_ib_dev     *dev;
 	u32 out[MLX5_ST_SZ_DW(create_mkey_out)];
 	struct mlx5_core_sig_ctx    *sig;
-	unsigned int		live;
 	void			*descs_alloc;
 	int			access_flags; /* Needed for rereg MR */
 
@@ -977,7 +976,9 @@ struct mlx5_ib_dev {
 	 * Sleepable RCU that prevents destruction of MRs while they are still
 	 * being used by a page fault handler.
 	 */
-	struct srcu_struct      mr_srcu;
+	struct srcu_struct      odp_srcu;
+	struct xarray		odp_mkeys;
+
 	u32			null_mkey;
 	struct mlx5_ib_flow_db	*flow_db;
 	/* protect resources needed as part of reset flow */
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -59,13 +59,9 @@ static bool umr_can_use_indirect_mkey(st
 
 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 {
-	int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
+	WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));
 
-	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
-		/* Wait until all page fault handlers using the mr complete. */
-		synchronize_srcu(&dev->mr_srcu);
-
-	return err;
+	return mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
 }
 
 static int order2idx(struct mlx5_ib_dev *dev, int order)
@@ -218,9 +214,6 @@ static void remove_keys(struct mlx5_ib_d
 		mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
 	}
 
-	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
-		synchronize_srcu(&dev->mr_srcu);
-
 	list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
 		list_del(&mr->list);
 		kfree(mr);
@@ -555,10 +548,6 @@ static void clean_keys(struct mlx5_ib_de
 		mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
 	}
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	synchronize_srcu(&dev->mr_srcu);
-#endif
-
 	list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
 		list_del(&mr->list);
 		kfree(mr);
@@ -1336,9 +1325,14 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct
 	if (is_odp_mr(mr)) {
 		to_ib_umem_odp(mr->umem)->private = mr;
 		atomic_set(&mr->num_pending_prefetch, 0);
+		err = xa_err(xa_store(&dev->odp_mkeys,
+				      mlx5_base_mkey(mr->mmkey.key), &mr->mmkey,
+				      GFP_KERNEL));
+		if (err) {
+			dereg_mr(dev, mr);
+			return ERR_PTR(err);
+		}
 	}
-	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
-		smp_store_release(&mr->live, 1);
 
 	return &mr->ibmr;
 error:
@@ -1580,10 +1574,10 @@ static void dereg_mr(struct mlx5_ib_dev
 		/* Prevent new page faults and
 		 * prefetch requests from succeeding
 		 */
-		WRITE_ONCE(mr->live, 0);
+		xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
 
 		/* Wait for all running page-fault handlers to finish. */
-		synchronize_srcu(&dev->mr_srcu);
+		synchronize_srcu(&dev->odp_srcu);
 
 		/* dequeue pending prefetch requests for the mr */
 		if (atomic_read(&mr->num_pending_prefetch))
@@ -1957,9 +1951,19 @@ struct ib_mw *mlx5_ib_alloc_mw(struct ib
 		}
 	}
 
+	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
+		err = xa_err(xa_store(&dev->odp_mkeys,
+				      mlx5_base_mkey(mw->mmkey.key), &mw->mmkey,
+				      GFP_KERNEL));
+		if (err)
+			goto free_mkey;
+	}
+
 	kfree(in);
 	return &mw->ibmw;
 
+free_mkey:
+	mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey);
 free:
 	kfree(mw);
 	kfree(in);
@@ -1973,13 +1977,12 @@ int mlx5_ib_dealloc_mw(struct ib_mw *mw)
 	int err;
 
 	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
-		xa_erase_irq(&dev->mdev->priv.mkey_table,
-			     mlx5_base_mkey(mmw->mmkey.key));
+		xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key));
 		/*
 		 * pagefault_single_data_segment() may be accessing mmw under
 		 * SRCU if the user bound an ODP MR to this MW.
 		 */
-		synchronize_srcu(&dev->mr_srcu);
+		synchronize_srcu(&dev->odp_srcu);
 	}
 
 	err = mlx5_core_destroy_mkey(dev->mdev, &mmw->mmkey);
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -199,7 +199,7 @@ void mlx5_odp_populate_klm(struct mlx5_k
 	 * locking around the children list.
 	 */
 	lockdep_assert_held(&to_ib_umem_odp(mr->umem)->umem_mutex);
-	lockdep_assert_held(&mr->dev->mr_srcu);
+	lockdep_assert_held(&mr->dev->odp_srcu);
 
 	odp = odp_lookup(offset * MLX5_IMR_MTT_SIZE,
 			 nentries * MLX5_IMR_MTT_SIZE, mr);
@@ -229,16 +229,16 @@ static void mr_leaf_free_action(struct w
 	int srcu_key;
 
 	mr->parent = NULL;
-	synchronize_srcu(&mr->dev->mr_srcu);
+	synchronize_srcu(&mr->dev->odp_srcu);
 
-	if (smp_load_acquire(&imr->live)) {
-		srcu_key = srcu_read_lock(&mr->dev->mr_srcu);
+	if (xa_load(&mr->dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key))) {
+		srcu_key = srcu_read_lock(&mr->dev->odp_srcu);
 		mutex_lock(&odp_imr->umem_mutex);
 		mlx5_ib_update_xlt(imr, idx, 1, 0,
 				   MLX5_IB_UPD_XLT_INDIRECT |
 				   MLX5_IB_UPD_XLT_ATOMIC);
 		mutex_unlock(&odp_imr->umem_mutex);
-		srcu_read_unlock(&mr->dev->mr_srcu, srcu_key);
+		srcu_read_unlock(&mr->dev->odp_srcu, srcu_key);
 	}
 	ib_umem_odp_release(odp);
 	mlx5_mr_cache_free(mr->dev, mr);
@@ -325,7 +325,7 @@ void mlx5_ib_invalidate_range(struct ib_
 
 	if (unlikely(!umem_odp->npages && mr->parent &&
 		     !umem_odp->dying)) {
-		WRITE_ONCE(mr->live, 0);
+		xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
 		umem_odp->dying = 1;
 		atomic_inc(&mr->parent->num_leaf_free);
 		schedule_work(&umem_odp->work);
@@ -435,6 +435,11 @@ static struct mlx5_ib_mr *implicit_mr_al
 	if (IS_ERR(mr))
 		return mr;
 
+	err = xa_reserve(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
+			 GFP_KERNEL);
+	if (err)
+		goto out_mr;
+
 	mr->ibmr.pd = pd;
 
 	mr->dev = dev;
@@ -460,7 +465,7 @@ static struct mlx5_ib_mr *implicit_mr_al
 	}
 
 	if (err)
-		goto fail;
+		goto out_release;
 
 	mr->ibmr.lkey = mr->mmkey.key;
 	mr->ibmr.rkey = mr->mmkey.key;
@@ -470,7 +475,9 @@ static struct mlx5_ib_mr *implicit_mr_al
 
 	return mr;
 
-fail:
+out_release:
+	xa_release(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
+out_mr:
 	mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
 	mlx5_mr_cache_free(dev, mr);
 
@@ -518,7 +525,8 @@ next_mr:
 		mtt->parent = mr;
 		INIT_WORK(&odp->work, mr_leaf_free_action);
 
-		smp_store_release(&mtt->live, 1);
+		xa_store(&dev->odp_mkeys, mlx5_base_mkey(mtt->mmkey.key),
+			 &mtt->mmkey, GFP_ATOMIC);
 
 		if (!nentries)
 			start_idx = addr >> MLX5_IMR_MTT_SHIFT;
@@ -572,7 +580,8 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implici
 	init_waitqueue_head(&imr->q_leaf_free);
 	atomic_set(&imr->num_leaf_free, 0);
 	atomic_set(&imr->num_pending_prefetch, 0);
-	smp_store_release(&imr->live, 1);
+	xa_store(&imr->dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key),
+		 &imr->mmkey, GFP_ATOMIC);
 
 	imr->is_odp_implicit = true;
 
@@ -785,13 +794,28 @@ static int pagefault_single_data_segment
 	size_t offset;
 	int ndescs;
 
-	srcu_key = srcu_read_lock(&dev->mr_srcu);
+	srcu_key = srcu_read_lock(&dev->odp_srcu);
 
 	io_virt += *bytes_committed;
 	bcnt -= *bytes_committed;
 
 next_mr:
-	mmkey = xa_load(&dev->mdev->priv.mkey_table, mlx5_base_mkey(key));
+	mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
+	if (!mmkey) {
+		mlx5_ib_dbg(
+			dev,
+			"skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
+			key);
+		if (bytes_mapped)
+			*bytes_mapped += bcnt;
+		/*
+		 * The user could specify a SGL with multiple lkeys and only
+		 * some of them are ODP. Treat the non-ODP ones as fully
+		 * faulted.
+		 */
+		ret = 0;
+		goto srcu_unlock;
+	}
 	if (!mkey_is_eq(mmkey, key)) {
 		mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
 		ret = -EFAULT;
@@ -801,20 +825,6 @@ next_mr:
 	switch (mmkey->type) {
 	case MLX5_MKEY_MR:
 		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
-		if (!smp_load_acquire(&mr->live) || !mr->ibmr.pd) {
-			mlx5_ib_dbg(dev, "got dead MR\n");
-			ret = -EFAULT;
-			goto srcu_unlock;
-		}
-
-		if (!is_odp_mr(mr)) {
-			mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
-				    key);
-			if (bytes_mapped)
-				*bytes_mapped += bcnt;
-			ret = 0;
-			goto srcu_unlock;
-		}
 
 		ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0);
 		if (ret < 0)
@@ -916,7 +926,7 @@ srcu_unlock:
 	}
 	kfree(out);
 
-	srcu_read_unlock(&dev->mr_srcu, srcu_key);
+	srcu_read_unlock(&dev->odp_srcu, srcu_key);
 	*bytes_committed = 0;
 	return ret ? ret : npages;
 }
@@ -1637,18 +1647,15 @@ get_prefetchable_mr(struct ib_pd *pd, en
 	struct ib_umem_odp *odp;
 	struct mlx5_ib_mr *mr;
 
-	lockdep_assert_held(&dev->mr_srcu);
+	lockdep_assert_held(&dev->odp_srcu);
 
-	mmkey = xa_load(&dev->mdev->priv.mkey_table, mlx5_base_mkey(lkey));
+	mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(lkey));
 	if (!mmkey || mmkey->key != lkey || mmkey->type != MLX5_MKEY_MR)
 		return NULL;
 
 	mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
 
-	if (!smp_load_acquire(&mr->live))
-		return NULL;
-
-	if (mr->ibmr.pd != pd || !is_odp_mr(mr))
+	if (mr->ibmr.pd != pd)
 		return NULL;
 
 	/*
@@ -1723,7 +1730,7 @@ static int mlx5_ib_prefetch_sg_list(stru
 	int ret = 0;
 	u32 i;
 
-	srcu_key = srcu_read_lock(&dev->mr_srcu);
+	srcu_key = srcu_read_lock(&dev->odp_srcu);
 	for (i = 0; i < num_sge; ++i) {
 		struct mlx5_ib_mr *mr;
 
@@ -1740,7 +1747,7 @@ static int mlx5_ib_prefetch_sg_list(stru
 	ret = 0;
 
 out:
-	srcu_read_unlock(&dev->mr_srcu, srcu_key);
+	srcu_read_unlock(&dev->odp_srcu, srcu_key);
 	return ret;
 }
 
@@ -1764,12 +1771,12 @@ int mlx5_ib_advise_mr_prefetch(struct ib
 	if (!work)
 		return -ENOMEM;
 
-	srcu_key = srcu_read_lock(&dev->mr_srcu);
+	srcu_key = srcu_read_lock(&dev->odp_srcu);
 	if (!init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge)) {
-		srcu_read_unlock(&dev->mr_srcu, srcu_key);
+		srcu_read_unlock(&dev->odp_srcu, srcu_key);
 		return -EINVAL;
 	}
 	queue_work(system_unbound_wq, &work->work);
-	srcu_read_unlock(&dev->mr_srcu, srcu_key);
+	srcu_read_unlock(&dev->odp_srcu, srcu_key);
 	return 0;
 }