Blob Blame History Raw
From: Karsten Graul <kgraul@linux.ibm.com>
Subject: net/smc: remove duplicate mutex_unlock
Patch-mainline: v4.19-rc6
Git-commit: 1ca52fcfaca43665d525645348801a6f4a4b9e9a
References: FATE#325698, LTC#167867, bsc#1113481

Description:  net/smc: bugfix and compatibility patches
Symptom:      Random hangs in smc processing:
                user space application hangs in socket send() or recv() call or
                never gets a notification from a select() call.
              Missing compatibility with other platforms:
                confirm rkey and delete rkey processing is required by the
                design, but delete rkey processing is missing. This leads to
                protocol failures when communicating with other platforms like
                zOS. SMC-D shutdown signal support is missing, so it is not
                detected when the remote peer closes the link group.
              Broken administration of available WR send payload buffers due to
              a use-after-free condition.
Problem:      Misbehaviour regarding the user space API can lead to hangs.
              SMC is not fully compatible with some other platforms
              due to missing rkey processing and SMC-D shutdown signal support.
Solution:     Fixed protocol deficiencies by implementing the required rkey
              processing. For SMC-D, the cursors are now handled atomically to
              handle parallel modifications. The SMC-D shutdown signal is now
              processed when received and sent to the remote peer if needed.
              Prereq patches are included.
Reproduction: Run SMC on a loaded system against zOS as peer system.

Upstream-Description:

              net/smc: remove duplicate mutex_unlock

              For a failing smc_listen_rdma_finish() smc_listen_decline() is
              called. If fallback is possible, the new socket is already enqueued
              to be accepted in smc_listen_decline(). Avoid enqueuing a second time
              afterwards in this case, otherwise the smc_create_lgr_pending lock
              is released twice:
              [  373.463976] WARNING: bad unlock balance detected!
              [  373.463978] 4.18.0-rc7+ #123 Tainted: G           O
              [  373.463979] -------------------------------------
              [  373.463980] kworker/1:1/30 is trying to release lock (smc_create_lgr_pending) at:
              [  373.463990] [<000003ff801205fc>] smc_listen_work+0x22c/0x5d0 [smc]
              [  373.463991] but there are no more locks to release!
              [  373.463991]
              other info that might help us debug this:
              [  373.463993] 2 locks held by kworker/1:1/30:
              [  373.463994]  #0: 00000000772cbaed ((wq_completion)"events"){+.+.}, at: process_one_work+0x1ec/0x6b0
              [  373.464000]  #1: 000000003ad0894a ((work_completion)(&new_smc->smc_listen_work)){+.+.}, at: process_one_work+0x1ec/0x6b0
              [  373.464003]
              stack backtrace:
              [  373.464005] CPU: 1 PID: 30 Comm: kworker/1:1 Kdump: loaded Tainted: G           O      4.18.0-rc7uschi+ #123
              [  373.464007] Hardware name: IBM 2827 H43 738 (LPAR)
              [  373.464010] Workqueue: events smc_listen_work [smc]
              [  373.464011] Call Trace:
              [  373.464015] ([<0000000000114100>] show_stack+0x60/0xd8)
              [  373.464019]  [<0000000000a8c9bc>] dump_stack+0x9c/0xd8
              [  373.464021]  [<00000000001dcaf8>] print_unlock_imbalance_bug+0xf8/0x108
              [  373.464022]  [<00000000001e045c>] lock_release+0x114/0x4f8
              [  373.464025]  [<0000000000aa87fa>] __mutex_unlock_slowpath+0x4a/0x300
              [  373.464027]  [<000003ff801205fc>] smc_listen_work+0x22c/0x5d0 [smc]
              [  373.464029]  [<0000000000197a68>] process_one_work+0x2a8/0x6b0
              [  373.464030]  [<0000000000197ec2>] worker_thread+0x52/0x410
              [  373.464033]  [<000000000019fd0e>] kthread+0x15e/0x178
              [  373.464035]  [<0000000000aaf58a>] kernel_thread_starter+0x6/0xc
              [  373.464052]  [<0000000000aaf584>] kernel_thread_starter+0x0/0xc
              [  373.464054] INFO: lockdep is turned off.

              Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
              Signed-off-by: David S. Miller <davem@davemloft.net>


Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Acked-by: Petr Tesarik <ptesarik@suse.com>
---
 net/smc/af_smc.c |   15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1153,9 +1153,9 @@ static int smc_listen_rdma_reg(struct sm
 }
 
 /* listen worker: finish RDMA setup */
-static void smc_listen_rdma_finish(struct smc_sock *new_smc,
-				   struct smc_clc_msg_accept_confirm *cclc,
-				   int local_contact)
+static int smc_listen_rdma_finish(struct smc_sock *new_smc,
+				  struct smc_clc_msg_accept_confirm *cclc,
+				  int local_contact)
 {
 	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
 	int reason_code = 0;
@@ -1178,11 +1178,12 @@ static void smc_listen_rdma_finish(struc
 		if (reason_code)
 			goto decline;
 	}
-	return;
+	return 0;
 
 decline:
 	mutex_unlock(&smc_create_lgr_pending);
 	smc_listen_decline(new_smc, reason_code, local_contact);
+	return reason_code;
 }
 
 /* setup for RDMA connection of server */
@@ -1279,8 +1280,10 @@ static void smc_listen_work(struct work_
 	}
 
 	/* finish worker */
-	if (!ism_supported)
-		smc_listen_rdma_finish(new_smc, &cclc, local_contact);
+	if (!ism_supported) {
+		if (smc_listen_rdma_finish(new_smc, &cclc, local_contact))
+			return;
+	}
 	smc_conn_save_peer_info(new_smc, &cclc);
 	mutex_unlock(&smc_create_lgr_pending);
 	smc_listen_out_connected(new_smc);