Blob Blame History Raw
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Sun, 29 Jul 2018 11:34:56 +0300
Subject: RDMA/netdev: Use priv_destructor for netdev cleanup
Patch-mainline: v4.19-rc1
Git-commit: 9f49a5b5c21d58aa84e16cfdc5e99e49faefcb7a
References: bsc#1103992 FATE#326009

Now that the unregister_netdev flow for IPoIB no longer relies on external
code we can now introduce the use of priv_destructor and
needs_free_netdev.

The rdma_netdev flow is switched to use the netdev common priv_destructor
instead of the special free_rdma_netdev and the IPOIB ULP adjusted:
 - priv_destructor needs to switch to point to the ULP's destructor
   which will then call the rdma_ndev's in the right order
 - We need to be careful around the error unwind of register_netdev
   as it sometimes calls priv_destructor on failure
 - ULPs need to use ndo_init/uninit to ensure proper ordering
   of failures around register_netdev

Switching to priv_destructor is a necessary pre-requisite to using
the rtnl new_link mechanism.

The VNIC user for rdma_netdev should also be revised, but that is left for
another patch.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Denis Drozdov <denisd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Acked-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
---
 drivers/infiniband/hw/mlx5/main.c                     |   10 -
 drivers/infiniband/ulp/ipoib/ipoib.h                  |    2 
 drivers/infiniband/ulp/ipoib/ipoib_main.c             |  101 +++++++++++-------
 drivers/infiniband/ulp/ipoib/ipoib_vlan.c             |   68 ++++++------
 drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c |   37 +++---
 include/linux/mlx5/driver.h                           |    3 
 include/rdma/ib_verbs.h                               |    6 -
 7 files changed, 129 insertions(+), 98 deletions(-)

--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -5163,11 +5163,6 @@ done:
 	return num_counters;
 }
 
-static void mlx5_ib_free_rdma_netdev(struct net_device *netdev)
-{
-	return mlx5_rdma_netdev_free(netdev);
-}
-
 static struct net_device*
 mlx5_ib_alloc_rdma_netdev(struct ib_device *hca,
 			  u8 port_num,
@@ -5177,17 +5172,12 @@ mlx5_ib_alloc_rdma_netdev(struct ib_devi
 			  void (*setup)(struct net_device *))
 {
 	struct net_device *netdev;
-	struct rdma_netdev *rn;
 
 	if (type != RDMA_NETDEV_IPOIB)
 		return ERR_PTR(-EOPNOTSUPP);
 
 	netdev = mlx5_rdma_netdev_alloc(to_mdev(hca)->mdev, hca,
 					name, setup);
-	if (likely(!IS_ERR_OR_NULL(netdev))) {
-		rn = netdev_priv(netdev);
-		rn->free_rdma_netdev = mlx5_ib_free_rdma_netdev;
-	}
 	return netdev;
 }
 
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -323,6 +323,7 @@ struct ipoib_dev_priv {
 	spinlock_t lock;
 
 	struct net_device *dev;
+	void (*next_priv_destructor)(struct net_device *dev);
 
 	struct napi_struct send_napi;
 	struct napi_struct recv_napi;
@@ -481,6 +482,7 @@ static inline void ipoib_put_ah(struct i
 	kref_put(&ah->ref, ipoib_free_ah);
 }
 int ipoib_open(struct net_device *dev);
+void ipoib_intf_free(struct net_device *dev);
 int ipoib_add_pkey_attr(struct net_device *dev);
 int ipoib_add_umcast_attr(struct net_device *dev);
 
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -2060,6 +2060,13 @@ void ipoib_setup_common(struct net_devic
 	netif_keep_dst(dev);
 
 	memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
+
+	/*
+	 * unregister_netdev always frees the netdev, we use this mode
+	 * consistently to unify all the various unregister paths, including
+	 * those connected to rtnl_link_ops which require it.
+	 */
+	dev->needs_free_netdev = true;
 }
 
 static void ipoib_build_priv(struct net_device *dev)
@@ -2114,9 +2121,7 @@ static struct net_device
 	rn->send = ipoib_send;
 	rn->attach_mcast = ipoib_mcast_attach;
 	rn->detach_mcast = ipoib_mcast_detach;
-	rn->free_rdma_netdev = free_netdev;
 	rn->hca = hca;
-
 	dev->netdev_ops = &ipoib_netdev_default_pf;
 
 	return dev;
@@ -2171,6 +2176,15 @@ struct ipoib_dev_priv *ipoib_intf_alloc(
 
 	rn = netdev_priv(dev);
 	rn->clnt_priv = priv;
+
+	/*
+	 * Only the child register_netdev flows can handle priv_destructor
+	 * being set, so we force it to NULL here and handle manually until it
+	 * is safe to turn on.
+	 */
+	priv->next_priv_destructor = dev->priv_destructor;
+	dev->priv_destructor = NULL;
+
 	ipoib_build_priv(dev);
 
 	return priv;
@@ -2179,6 +2193,27 @@ free_priv:
 	return NULL;
 }
 
+void ipoib_intf_free(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = ipoib_priv(dev);
+	struct rdma_netdev *rn = netdev_priv(dev);
+
+	dev->priv_destructor = priv->next_priv_destructor;
+	if (dev->priv_destructor)
+		dev->priv_destructor(dev);
+
+	/*
+	 * There are some error flows around register_netdev failing that may
+	 * attempt to call priv_destructor twice, prevent that from happening.
+	 */
+	dev->priv_destructor = NULL;
+
+	/* unregister/destroy is very complicated. Make bugs more obvious. */
+	rn->clnt_priv = NULL;
+
+	kfree(priv);
+}
+
 static ssize_t show_pkey(struct device *dev,
 			 struct device_attribute *attr, char *buf)
 {
@@ -2339,7 +2374,7 @@ static struct net_device *ipoib_add_port
 					 struct ib_device *hca, u8 port)
 {
 	struct ipoib_dev_priv *priv;
-	struct rdma_netdev *rn;
+	struct net_device *ndev;
 	int result;
 
 	priv = ipoib_intf_alloc(hca, port, format);
@@ -2347,6 +2382,7 @@ static struct net_device *ipoib_add_port
 		pr_warn("%s, %d: ipoib_intf_alloc failed\n", hca->name, port);
 		return ERR_PTR(-ENOMEM);
 	}
+	ndev = priv->dev;
 
 	INIT_IB_EVENT_HANDLER(&priv->event_handler,
 			      priv->ca, ipoib_event);
@@ -2355,38 +2391,43 @@ static struct net_device *ipoib_add_port
 	/* call event handler to ensure pkey in sync */
 	queue_work(ipoib_workqueue, &priv->flush_heavy);
 
-	result = register_netdev(priv->dev);
+	result = register_netdev(ndev);
 	if (result) {
 		pr_warn("%s: couldn't register ipoib port %d; error %d\n",
 			hca->name, port, result);
-		ipoib_parent_unregister_pre(priv->dev);
-		goto device_init_failed;
+
+		ipoib_parent_unregister_pre(ndev);
+		ipoib_intf_free(ndev);
+		free_netdev(ndev);
+
+		return ERR_PTR(result);
 	}
 
-	result = -ENOMEM;
-	if (ipoib_cm_add_mode_attr(priv->dev))
+	/*
+	 * We cannot set priv_destructor before register_netdev because we
+	 * need priv to be always valid during the error flow to execute
+	 * ipoib_parent_unregister_pre(). Instead handle it manually and only
+	 * enter priv_destructor mode once we are completely registered.
+	 */
+	ndev->priv_destructor = ipoib_intf_free;
+
+	if (ipoib_cm_add_mode_attr(ndev))
 		goto sysfs_failed;
-	if (ipoib_add_pkey_attr(priv->dev))
+	if (ipoib_add_pkey_attr(ndev))
 		goto sysfs_failed;
-	if (ipoib_add_umcast_attr(priv->dev))
+	if (ipoib_add_umcast_attr(ndev))
 		goto sysfs_failed;
-	if (device_create_file(&priv->dev->dev, &dev_attr_create_child))
+	if (device_create_file(&ndev->dev, &dev_attr_create_child))
 		goto sysfs_failed;
-	if (device_create_file(&priv->dev->dev, &dev_attr_delete_child))
+	if (device_create_file(&ndev->dev, &dev_attr_delete_child))
 		goto sysfs_failed;
 
-	return priv->dev;
+	return ndev;
 
 sysfs_failed:
-	ipoib_parent_unregister_pre(priv->dev);
-	unregister_netdev(priv->dev);
-
-device_init_failed:
-	rn = netdev_priv(priv->dev);
-	rn->free_rdma_netdev(priv->dev);
-	kfree(priv);
-
-	return ERR_PTR(result);
+	ipoib_parent_unregister_pre(ndev);
+	unregister_netdev(ndev);
+	return ERR_PTR(-ENOMEM);
 }
 
 static void ipoib_add_one(struct ib_device *device)
@@ -2424,33 +2465,19 @@ static void ipoib_add_one(struct ib_devi
 
 static void ipoib_remove_one(struct ib_device *device, void *client_data)
 {
-	struct ipoib_dev_priv *priv, *tmp, *cpriv, *tcpriv;
+	struct ipoib_dev_priv *priv, *tmp;
 	struct list_head *dev_list = client_data;
 
 	if (!dev_list)
 		return;
 
 	list_for_each_entry_safe(priv, tmp, dev_list, list) {
-		struct rdma_netdev *parent_rn = netdev_priv(priv->dev);
-
 		ipoib_parent_unregister_pre(priv->dev);
 
 		/* Wrap rtnl_lock/unlock with mutex to protect sysfs calls */
 		mutex_lock(&priv->sysfs_mutex);
 		unregister_netdev(priv->dev);
 		mutex_unlock(&priv->sysfs_mutex);
-
-		parent_rn->free_rdma_netdev(priv->dev);
-
-		list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
-			struct rdma_netdev *child_rn;
-
-			child_rn = netdev_priv(cpriv->dev);
-			child_rn->free_rdma_netdev(cpriv->dev);
-			kfree(cpriv);
-		}
-
-		kfree(priv);
 	}
 
 	kfree(dev_list);
--- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -50,31 +50,53 @@ static ssize_t show_parent(struct device
 }
 static DEVICE_ATTR(parent, S_IRUGO, show_parent, NULL);
 
+/*
+ * NOTE: If this function fails then the priv->dev will remain valid, however
+ * priv can have been freed and must not be touched by caller in the error
+ * case.
+ *
+ * If (ndev->reg_state == NETREG_UNINITIALIZED) then it is up to the caller to
+ * free the net_device (just as rtnl_newlink does) otherwise the net_device
+ * will be freed when the rtnl is unlocked.
+ */
 int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv,
 		     u16 pkey, int type)
 {
+	struct net_device *ndev = priv->dev;
 	int result;
 
+	ASSERT_RTNL();
+
 	priv->parent = ppriv->dev;
 	priv->pkey = pkey;
 	priv->child_type = type;
 
-	result = register_netdevice(priv->dev);
+	/* We do not need to touch priv if register_netdevice fails */
+	ndev->priv_destructor = ipoib_intf_free;
+
+	result = register_netdevice(ndev);
 	if (result) {
 		ipoib_warn(priv, "failed to initialize; error %i", result);
+
+		/*
+		 * register_netdevice sometimes calls priv_destructor,
+		 * sometimes not. Make sure it was done.
+		 */
+		if (ndev->priv_destructor)
+			ndev->priv_destructor(ndev);
 		return result;
 	}
 
 	/* RTNL childs don't need proprietary sysfs entries */
 	if (type == IPOIB_LEGACY_CHILD) {
-		if (ipoib_cm_add_mode_attr(priv->dev))
+		if (ipoib_cm_add_mode_attr(ndev))
 			goto sysfs_failed;
-		if (ipoib_add_pkey_attr(priv->dev))
+		if (ipoib_add_pkey_attr(ndev))
 			goto sysfs_failed;
-		if (ipoib_add_umcast_attr(priv->dev))
+		if (ipoib_add_umcast_attr(ndev))
 			goto sysfs_failed;
 
-		if (device_create_file(&priv->dev->dev, &dev_attr_parent))
+		if (device_create_file(&ndev->dev, &dev_attr_parent))
 			goto sysfs_failed;
 	}
 
@@ -91,6 +113,7 @@ int ipoib_vlan_add(struct net_device *pd
 {
 	struct ipoib_dev_priv *ppriv, *priv;
 	char intf_name[IFNAMSIZ];
+	struct net_device *ndev;
 	struct ipoib_dev_priv *tpriv;
 	int result;
 
@@ -122,12 +145,6 @@ int ipoib_vlan_add(struct net_device *pd
 		return restart_syscall();
 	}
 
-	priv = ipoib_intf_alloc(ppriv->ca, ppriv->port, intf_name);
-	if (!priv) {
-		result = -ENOMEM;
-		goto out;
-	}
-
 	/*
 	 * First ensure this isn't a duplicate. We check the parent device and
 	 * then all of the legacy child interfaces to make sure the Pkey
@@ -146,21 +163,23 @@ int ipoib_vlan_add(struct net_device *pd
 		}
 	}
 
+	priv = ipoib_intf_alloc(ppriv->ca, ppriv->port, intf_name);
+	if (!priv) {
+		result = -ENOMEM;
+		goto out;
+	}
+	ndev = priv->dev;
+
 	result = __ipoib_vlan_add(ppriv, priv, pkey, IPOIB_LEGACY_CHILD);
 
+	if (result && ndev->reg_state == NETREG_UNINITIALIZED)
+		free_netdev(ndev);
+
 out:
 	up_write(&ppriv->vlan_rwsem);
 	rtnl_unlock();
 	mutex_unlock(&ppriv->sysfs_mutex);
 
-	if (result && priv) {
-		struct rdma_netdev *rn;
-
-		rn = netdev_priv(priv->dev);
-		rn->free_rdma_netdev(priv->dev);
-		kfree(priv);
-	}
-
 	return result;
 }
 
@@ -212,14 +231,5 @@ int ipoib_vlan_delete(struct net_device
 	rtnl_unlock();
 	mutex_unlock(&ppriv->sysfs_mutex);
 
-	if (dev) {
-		struct rdma_netdev *rn;
-
-		rn = netdev_priv(dev);
-		rn->free_rdma_netdev(priv->dev);
-		kfree(priv);
-		return 0;
-	}
-
-	return -ENODEV;
+	return (dev) ? 0 : -ENODEV;
 }
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
@@ -584,6 +584,22 @@ static int mlx5i_check_required_hca_cap(
 	return 0;
 }
 
+static void mlx5_rdma_netdev_free(struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = mlx5i_epriv(netdev);
+	struct mlx5i_priv *ipriv = priv->ppriv;
+	const struct mlx5e_profile *profile = priv->profile;
+
+	mlx5e_detach_netdev(priv);
+	profile->cleanup(priv);
+	destroy_workqueue(priv->wq);
+
+	if (!ipriv->sub_interface) {
+		mlx5i_pkey_qpn_ht_cleanup(netdev);
+		mlx5e_destroy_mdev_resources(priv->mdev);
+	}
+}
+
 struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev,
 					  struct ib_device *ibdev,
 					  const char *name,
@@ -657,6 +673,9 @@ struct net_device *mlx5_rdma_netdev_allo
 	rn->detach_mcast = mlx5i_detach_mcast;
 	rn->set_id = mlx5i_set_pkey_index;
 
+	netdev->priv_destructor = mlx5_rdma_netdev_free;
+	netdev->needs_free_netdev = 1;
+
 	return netdev;
 
 destroy_ht:
@@ -669,21 +688,3 @@ err_free_netdev:
 	return NULL;
 }
 EXPORT_SYMBOL(mlx5_rdma_netdev_alloc);
-
-void mlx5_rdma_netdev_free(struct net_device *netdev)
-{
-	struct mlx5e_priv *priv = mlx5i_epriv(netdev);
-	struct mlx5i_priv *ipriv = priv->ppriv;
-	const struct mlx5e_profile *profile = priv->profile;
-
-	mlx5e_detach_netdev(priv);
-	profile->cleanup(priv);
-	destroy_workqueue(priv->wq);
-
-	if (!ipriv->sub_interface) {
-		mlx5i_pkey_qpn_ht_cleanup(netdev);
-		mlx5e_destroy_mdev_resources(priv->mdev);
-	}
-	free_netdev(netdev);
-}
-EXPORT_SYMBOL(mlx5_rdma_netdev_free);
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1235,14 +1235,11 @@ struct net_device *mlx5_rdma_netdev_allo
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
-
-static inline void mlx5_rdma_netdev_free(struct net_device *netdev) {}
 #else
 struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev,
 					  struct ib_device *ibdev,
 					  const char *name,
 					  void (*setup)(struct net_device *));
-void mlx5_rdma_netdev_free(struct net_device *netdev);
 #endif /* CONFIG_MLX5_CORE_IPOIB */
 
 struct mlx5_profile {
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2206,7 +2206,11 @@ struct rdma_netdev {
 	struct ib_device  *hca;
 	u8                 port_num;
 
-	/* cleanup function must be specified */
+	/*
+	 * cleanup function must be specified.
+	 * FIXME: This is only used for OPA_VNIC and that usage should be
+	 * removed too.
+	 */
 	void (*free_rdma_netdev)(struct net_device *netdev);
 
 	/* control functions */