Jiri Slaby ef7db2
From: farah kassabri <fkassabri@habana.ai>
Jiri Slaby ef7db2
Date: Tue, 10 Jan 2023 12:29:55 +0200
Jiri Slaby ef7db2
Subject: [PATCH] habanalabs: fix bug in timestamps registration code
Jiri Slaby ef7db2
References: bsc#1012628
Jiri Slaby ef7db2
Patch-mainline: 6.2.3
Jiri Slaby ef7db2
Git-commit: ac5af9900f82b7034de7c9eb1d70d030ba325607
Jiri Slaby ef7db2
Jiri Slaby ef7db2
[ Upstream commit ac5af9900f82b7034de7c9eb1d70d030ba325607 ]
Jiri Slaby ef7db2
Jiri Slaby ef7db2
Protect re-using the same timestamp buffer record before actually
Jiri Slaby ef7db2
adding it to the interrupt wait list.
Jiri Slaby ef7db2
Mark ts buff offset as in use in the spinlock protection area of the
Jiri Slaby ef7db2
interrupt wait list to avoid getting in the re-use section in
Jiri Slaby ef7db2
ts_buff_get_kernel_ts_record before adding the node to the list.
Jiri Slaby ef7db2
This scenario might happen when multiple threads are racing on
Jiri Slaby ef7db2
same offset and one thread could set data in the ts buff in
Jiri Slaby ef7db2
ts_buff_get_kernel_ts_record then the other thread takes over
Jiri Slaby ef7db2
and gets to ts_buff_get_kernel_ts_record and we will try
Jiri Slaby ef7db2
to re-use the same ts buff offset, and then we will try to
Jiri Slaby ef7db2
delete a non-existent node from the list.
Jiri Slaby ef7db2
Jiri Slaby ef7db2
Signed-off-by: farah kassabri <fkassabri@habana.ai>
Jiri Slaby ef7db2
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Jiri Slaby ef7db2
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Jiri Slaby ef7db2
Signed-off-by: Sasha Levin <sashal@kernel.org>
Jiri Slaby ef7db2
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Jiri Slaby ef7db2
---
Jiri Slaby ef7db2
 .../habanalabs/common/command_submission.c    | 33 ++++++++++++-------
Jiri Slaby ef7db2
 1 file changed, 22 insertions(+), 11 deletions(-)
Jiri Slaby ef7db2
Jiri Slaby ef7db2
diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
Jiri Slaby ef7db2
index ea0e5101..6367cbea 100644
Jiri Slaby ef7db2
--- a/drivers/misc/habanalabs/common/command_submission.c
Jiri Slaby ef7db2
+++ b/drivers/misc/habanalabs/common/command_submission.c
Jiri Slaby ef7db2
@@ -3119,19 +3119,18 @@ static int ts_buff_get_kernel_ts_record(struct hl_mmap_mem_buf *buf,
Jiri Slaby ef7db2
 			goto start_over;
Jiri Slaby ef7db2
 		}
Jiri Slaby ef7db2
 	} else {
Jiri Slaby ef7db2
+		/* Fill up the new registration node info */
Jiri Slaby ef7db2
+		requested_offset_record->ts_reg_info.buf = buf;
Jiri Slaby ef7db2
+		requested_offset_record->ts_reg_info.cq_cb = cq_cb;
Jiri Slaby ef7db2
+		requested_offset_record->ts_reg_info.timestamp_kernel_addr =
Jiri Slaby ef7db2
+				(u64 *) ts_buff->user_buff_address + ts_offset;
Jiri Slaby ef7db2
+		requested_offset_record->cq_kernel_addr =
Jiri Slaby ef7db2
+				(u64 *) cq_cb->kernel_address + cq_offset;
Jiri Slaby ef7db2
+		requested_offset_record->cq_target_value = target_value;
Jiri Slaby ef7db2
+
Jiri Slaby ef7db2
 		spin_unlock_irqrestore(wait_list_lock, flags);
Jiri Slaby ef7db2
 	}
Jiri Slaby ef7db2
 
Jiri Slaby ef7db2
-	/* Fill up the new registration node info */
Jiri Slaby ef7db2
-	requested_offset_record->ts_reg_info.in_use = 1;
Jiri Slaby ef7db2
-	requested_offset_record->ts_reg_info.buf = buf;
Jiri Slaby ef7db2
-	requested_offset_record->ts_reg_info.cq_cb = cq_cb;
Jiri Slaby ef7db2
-	requested_offset_record->ts_reg_info.timestamp_kernel_addr =
Jiri Slaby ef7db2
-			(u64 *) ts_buff->user_buff_address + ts_offset;
Jiri Slaby ef7db2
-	requested_offset_record->cq_kernel_addr =
Jiri Slaby ef7db2
-			(u64 *) cq_cb->kernel_address + cq_offset;
Jiri Slaby ef7db2
-	requested_offset_record->cq_target_value = target_value;
Jiri Slaby ef7db2
-
Jiri Slaby ef7db2
 	*pend = requested_offset_record;
Jiri Slaby ef7db2
 
Jiri Slaby ef7db2
 	dev_dbg(buf->mmg->dev, "Found available node in TS kernel CB %p\n",
Jiri Slaby ef7db2
@@ -3179,7 +3178,7 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
Jiri Slaby ef7db2
 			goto put_cq_cb;
Jiri Slaby ef7db2
 		}
Jiri Slaby ef7db2
 
Jiri Slaby ef7db2
-		/* Find first available record */
Jiri Slaby ef7db2
+		/* get ts buffer record */
Jiri Slaby ef7db2
 		rc = ts_buff_get_kernel_ts_record(buf, cq_cb, ts_offset,
Jiri Slaby ef7db2
 						cq_counters_offset, target_value,
Jiri Slaby ef7db2
 						&interrupt->wait_list_lock, &pend);
Jiri Slaby ef7db2
@@ -3227,7 +3226,19 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
Jiri Slaby ef7db2
 	 * Note that we cannot have sorted list by target value,
Jiri Slaby ef7db2
 	 * in order to shorten the list pass loop, since
Jiri Slaby ef7db2
 	 * same list could have nodes for different cq counter handle.
Jiri Slaby ef7db2
+	 * Note:
Jiri Slaby ef7db2
+	 * Mark ts buff offset as in use here in the spinlock protection area
Jiri Slaby ef7db2
+	 * to avoid getting in the re-use section in ts_buff_get_kernel_ts_record
Jiri Slaby ef7db2
+	 * before adding the node to the list. this scenario might happen when
Jiri Slaby ef7db2
+	 * multiple threads are racing on same offset and one thread could
Jiri Slaby ef7db2
+	 * set the ts buff in ts_buff_get_kernel_ts_record then the other thread
Jiri Slaby ef7db2
+	 * takes over and get to ts_buff_get_kernel_ts_record and then we will try
Jiri Slaby ef7db2
+	 * to re-use the same ts buff offset, and will try to delete a non existing
Jiri Slaby ef7db2
+	 * node from the list.
Jiri Slaby ef7db2
 	 */
Jiri Slaby ef7db2
+	if (register_ts_record)
Jiri Slaby ef7db2
+		pend->ts_reg_info.in_use = 1;
Jiri Slaby ef7db2
+
Jiri Slaby ef7db2
 	list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head);
Jiri Slaby ef7db2
 	spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
Jiri Slaby ef7db2
 
Jiri Slaby ef7db2
-- 
Jiri Slaby ef7db2
2.35.3
Jiri Slaby ef7db2