Blob Blame History Raw
From 8d355a46c1e0cea59be3ea8395409a5e6eeed946 Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Fri, 2 Jun 2023 17:18:38 +0800
Subject: [PATCH] md/raid10: Do not add spare disk when recovery fails
Git-commit: 8d355a46c1e0cea59be3ea8395409a5e6eeed946
Patch-mainline: v6.5-rc1
References: git-fixes

In raid10_sync_request(), if data cannot be read from any disk for
recovery, it will go to 'giveup' and let 'chunks_skipped' + 1. After
multiple 'giveup', when 'chunks_skipped >= geo.raid_disks', it will
return 'max_sector', indicating that the recovery has been completed.
However, the recovery is just aborted and the data remains inconsistent.

Fix it by setting mirror->recovery_disabled, which will prevent the spare
disk from being added to this mirror. The same issue also exists during
resync, it will be fixed afterwards.

Signed-off-by: Li Nan <linan122@huawei.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230602091839.743798-2-linan666@huaweicloud.com
Signed-off-by: Coly Li <colyli@suse.de>

---
 drivers/md/raid10.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ab0e2485b2b7..1b953e788ce1 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3311,6 +3311,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 	int chunks_skipped = 0;
 	sector_t chunk_mask = conf->geo.chunk_mask;
 	int page_idx = 0;
+	int error_disk = -1;
 
 	/*
 	 * Allow skipping a full rebuild for incremental assembly
@@ -3394,8 +3395,21 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 		return reshape_request(mddev, sector_nr, skipped);
 
 	if (chunks_skipped >= conf->geo.raid_disks) {
-		/* if there has been nothing to do on any drive,
-		 * then there is nothing to do at all..
+		pr_err("md/raid10:%s: %s fails\n", mdname(mddev),
+			test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?  "resync" : "recovery");
+		if (error_disk >= 0 &&
+		    !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+			/*
+			 * recovery fails, set mirrors.recovery_disabled,
+			 * device shouldn't be added to there.
+			 */
+			conf->mirrors[error_disk].recovery_disabled =
+						mddev->recovery_disabled;
+			return 0;
+		}
+		/*
+		 * if there has been nothing to do on any drive,
+		 * then there is nothing to do at all.
 		 */
 		*skipped = 1;
 		return (max_sector - sector_nr) + sectors_skipped;
@@ -3646,6 +3660,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 						       mdname(mddev));
 					mirror->recovery_disabled
 						= mddev->recovery_disabled;
+				} else {
+					error_disk = i;
 				}
 				put_buf(r10_bio);
 				if (rb2)
-- 
2.35.3