From: Jeff Mahoney <jeffm@suse.com>
Subject: mdraid: fix read/write bytes accounting
Patch-mainline: Submitted to linux-raid, 5 Jun 2020
References: bsc#1172537

The I/O accounting published in /proc/diskstats for mdraid is currently
broken.  md_make_request does the accounting for every bio passed to it,
but when a bio needs to be split, each of the split bios is also
submitted through md_make_request, so the same I/O gets accounted
multiple times.

As a result, a quick test on a RAID6 volume displayed the following
behavior:

# dd if=/dev/md5 seek=1024 bs=512 count=128k of=/dev/null iflag=direct
131072+0 records in
131072+0 records out
67108864 bytes (67 MB, 64 MiB) copied, 13.9227 s, 4.8 MB/s

... shows 131072 sectors read -- 64 MiB.  Correct.

# dd if=/dev/md5 seek=1024 bs=128k count=512 of=/dev/null iflag=direct
512+0 records in
512+0 records out
67108864 bytes (67 MB, 64 MiB) copied, 0.287365 s, 234 MB/s

... shows 196608 sectors read -- 96 MiB.  With a 64k stripe size, we're
seeing some splitting.

# dd if=/dev/md5 seek=1024 bs=1M count=512 of=/dev/null iflag=direct
512+0 records in
512+0 records out
536870912 bytes (537 MB, 512 MiB) copied, 0.705004 s, 762 MB/s

... shows 4718624 sectors read -- 2.25 GiB transferred.  Lots of splitting
and a stats explosion.

The fix is to push the accounting into the personality make_request
callbacks so that only the bios actually submitted for I/O will be
accounted.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
---
 drivers/md/md-faulty.c    |    1 +
 drivers/md/md-linear.c    |    1 +
 drivers/md/md-multipath.c |    1 +
 drivers/md/md.c           |   25 +++++++++++++++++++------
 drivers/md/md.h           |    1 +
 drivers/md/raid0.c        |    1 +
 drivers/md/raid1.c        |    3 +++
 drivers/md/raid10.c       |    3 +++
 drivers/md/raid5.c        |    3 +++
 9 files changed, 33 insertions(+), 6 deletions(-)

--- a/drivers/md/md-faulty.c
+++ b/drivers/md/md-faulty.c
@@ -223,6 +223,7 @@ static bool faulty_make_request(struct m
 	} else
 		bio_set_dev(bio, conf->rdev->bdev);
 
+	md_io_acct(mddev, bio_data_dir(bio), bio_sectors(bio));
 	generic_make_request(bio);
 	return true;
 }
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -278,6 +278,7 @@ static bool linear_make_request(struct m
 		bio = split;
 	}
 
+	md_io_acct(mddev, bio_data_dir(bio), bio_sectors(bio));
 	bio_set_dev(bio, tmp_dev->rdev->bdev);
 	bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
 		start_sector + data_offset;
--- a/drivers/md/md-multipath.c
+++ b/drivers/md/md-multipath.c
@@ -139,6 +139,7 @@ static bool multipath_make_request(struc
 	mp_bh->bio.bi_private = mp_bh;
 	mddev_check_writesame(mddev, &mp_bh->bio);
 	mddev_check_write_zeroes(mddev, &mp_bh->bio);
+	md_io_acct(mddev, bio_data_dir(bio), bio_sectors(bio));
 	generic_make_request(&mp_bh->bio);
 	return true;
 }
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -377,12 +377,30 @@ check_suspended:
 }
 EXPORT_SYMBOL(md_handle_request);
 
+/*
+ * This is generic_start_io_acct without the inflight tracking.  Since
+ * we can't reliably call generic_end_io_acct, the inflight counter
+ * would also be unreliable and it's better to keep it at 0.
+ */
+void md_io_acct(struct mddev *mddev, int rw, unsigned int sectors)
+{
+	int cpu;
+
+	if (!mddev->gendisk)
+		return;
+	cpu = part_stat_lock();
+	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
+	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
+	part_stat_unlock();
+
+}
+EXPORT_SYMBOL_GPL(md_io_acct);
+
 static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int rw = bio_data_dir(bio);
 	struct mddev *mddev = q->queuedata;
 	unsigned int sectors;
-	int cpu;
 
 	if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
 		bio_io_error(bio);
@@ -412,11 +430,6 @@ static blk_qc_t md_make_request(struct r
 
 	md_handle_request(mddev, bio);
 
-	cpu = part_stat_lock();
-	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
-	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
-	part_stat_unlock();
-
 	return BLK_QC_T_NONE;
 }
 
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -745,6 +745,7 @@ extern void mddev_suspend(struct mddev *
 extern void mddev_resume(struct mddev *mddev);
 extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 				   struct mddev *mddev);
+extern void md_io_acct(struct mddev *mddev, int rw, unsigned int sectors);
 
 extern void md_reload_sb(struct mddev *mddev, int raid_disk);
 extern void md_update_sb(struct mddev *mddev, int force);
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -637,6 +637,7 @@ static bool raid0_make_request(struct md
 				disk_devt(mddev->gendisk), bio_sector);
 	mddev_check_writesame(mddev, bio);
 	mddev_check_write_zeroes(mddev, bio);
+	md_io_acct(mddev, bio_data_dir(bio), bio_sectors(bio));
 	generic_make_request(bio);
 	return true;
 }
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1340,6 +1340,7 @@ static void raid1_read_request(struct md
 		r1_bio->sectors = max_sectors;
 	}
 
+	md_io_acct(mddev, bio_data_dir(bio), bio_sectors(bio));
 	r1_bio->read_disk = rdisk;
 
 	read_bio = bio_clone_fast(bio, gfp, mddev->bio_set);
@@ -1511,6 +1512,8 @@ static void raid1_write_request(struct m
 		r1_bio->sectors = max_sectors;
 	}
 
+	md_io_acct(mddev, bio_data_dir(bio), bio_sectors(bio));
+
 	atomic_set(&r1_bio->remaining, 1);
 	atomic_set(&r1_bio->behind_remaining, 0);
 
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1253,6 +1253,8 @@ static void raid10_read_request(struct m
 	}
 	slot = r10_bio->read_slot;
 
+	md_io_acct(mddev, bio_data_dir(bio), bio_sectors(bio));
+
 	read_bio = bio_clone_fast(bio, gfp, mddev->bio_set);
 
 	r10_bio->devs[slot].bio = read_bio;
@@ -1536,6 +1538,7 @@ retry_write:
 		r10_bio->master_bio = bio;
 	}
 
+	md_io_acct(mddev, bio_data_dir(bio), bio_sectors(bio));
 	atomic_set(&r10_bio->remaining, 1);
 	md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
 
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5321,6 +5321,7 @@ static struct bio *chunk_aligned_read(st
 	if (!raid5_read_one_chunk(mddev, raid_bio))
 		return raid_bio;
 
+	md_io_acct(mddev, bio_data_dir(raid_bio), bio_sectors(raid_bio));
 	return NULL;
 }
 
@@ -5639,6 +5640,8 @@ static bool raid5_make_request(struct md
 	last_sector = bio_end_sector(bi);
 	bi->bi_next = NULL;
 
+	md_io_acct(mddev, bio_data_dir(bi), bio_sectors(bi));
+
 	prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
 		int previous;