From: Mikulas Patocka <mpatocka@redhat.com>
Date: Wed, 29 Apr 2020 12:30:03 -0400
Subject: [PATCH] dm writecache: improve performance on DDR persistent memory
 (Optane)
Git-commit: 48338daaa00e6137a43fa5d0e54b763aa34f450b
Patch-mainline: v5.8-rc1
References: bsc#1175995,jsc#SLE-15608

When testing the dm-writecache target on a real DDR persistent memory
(Intel Optane), it turned out that explicit cache flushing using the
clflushopt instruction performs better than non-temporal stores for
block sizes 1k, 2k and 4k.

The dm-writecache target is single-threaded (all the copying is done
while holding the writecache lock), so it benefits from clwb, see:
http://lore.kernel.org/r/alpine.LRH.2.02.2004160411460.7833@file01.intranet.prod.int.rdu2.redhat.com

Add a new function, memcpy_flushcache_optimized(), that checks whether
clflushopt is available and, if it is, uses it instead of
memcpy_flushcache().
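
For reference, below is a rough userspace sketch of the kind of comparison
described above. It is not part of this patch: the 64 MiB working set, the
use of ordinary malloc'ed DRAM instead of a DAX-mapped persistent memory
file, and the build command (gcc -O2 -mclflushopt) are illustrative
assumptions. On DRAM the absolute numbers will differ from the Optane
figures; the point of interest is the crossover between the two copy
methods as the block size grows from 512 bytes to 4k.

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <time.h>
	#include <immintrin.h>

	#define BUF	(64 << 20)	/* illustrative 64 MiB working set */

	/* non-temporal 64-bit stores (movnti), as memcpy_flushcache() uses on x86 */
	static void copy_movnti(char *dst, const char *src, size_t n)
	{
		for (size_t i = 0; i < n; i += 8) {
			long long v;
			memcpy(&v, src + i, 8);
			_mm_stream_si64((long long *)(dst + i), v);
		}
		_mm_sfence();
	}

	/* cache-allocating copy of each 64-byte line followed by clflushopt */
	static void copy_clflushopt(char *dst, const char *src, size_t n)
	{
		for (size_t i = 0; i < n; i += 64) {
			memcpy(dst + i, src + i, 64);
			_mm_clflushopt(dst + i);
		}
		_mm_sfence();
	}

	static void bench(const char *name,
			  void (*copy)(char *, const char *, size_t),
			  char *dst, const char *src, size_t block)
	{
		struct timespec t0, t1;

		clock_gettime(CLOCK_MONOTONIC, &t0);
		for (size_t off = 0; off + block <= BUF; off += block)
			copy(dst + off, src + off, block);
		clock_gettime(CLOCK_MONOTONIC, &t1);

		double s = (t1.tv_sec - t0.tv_sec) +
			   (t1.tv_nsec - t0.tv_nsec) / 1e9;
		printf("%-12s block %5zu: %6.0f MB/s\n", name, block, BUF / s / 1e6);
	}

	int main(void)
	{
		/* plain DRAM here; a real test would mmap a DAX file on pmem */
		char *src = aligned_alloc(64, BUF);
		char *dst = aligned_alloc(64, BUF);

		if (!src || !dst)
			return 1;
		memset(src, 0x5a, BUF);

		for (size_t block = 512; block <= 4096; block *= 2) {
			bench("movnti", copy_movnti, dst, src, block);
			bench("clflushopt", copy_clflushopt, dst, src, block);
		}
		return 0;
	}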

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Hannes Reinecke <hare@suse.com>
---
 drivers/md/dm-writecache.c | 38 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index d29d3e234e01..74f3c506f084 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -1139,6 +1139,42 @@ static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
 	return r;
 }
 
+static void memcpy_flushcache_optimized(void *dest, void *source, size_t size)
+{
+	/*
+	 * clflushopt performs better with block size 1024, 2048, 4096
+	 * non-temporal stores perform better with block size 512
+	 *
+	 * block size   512             1024            2048            4096
+	 * movnti       496 MB/s        642 MB/s        725 MB/s        744 MB/s
+	 * clflushopt   373 MB/s        688 MB/s        1.1 GB/s        1.2 GB/s
+	 *
+	 * We see that movnti performs better for 512-byte blocks, and
+	 * clflushopt performs better for 1024-byte and larger blocks. So, we
+	 * prefer clflushopt for sizes >= 768.
+	 *
+	 * NOTE: this happens to be the case now (with dm-writecache's single
+	 * threaded model) but re-evaluate this once memcpy_flushcache() is
+	 * enabled to use movdir64b which might invalidate this performance
+	 * advantage seen with cache-allocating-writes plus flushing.
+	 */
+#ifdef CONFIG_X86
+	if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) &&
+	    likely(boot_cpu_data.x86_clflush_size == 64) &&
+	    likely(size >= 768)) {
+		do {
+			memcpy((void *)dest, (void *)source, 64);
+			clflushopt((void *)dest);
+			dest += 64;
+			source += 64;
+			size -= 64;
+		} while (size >= 64);
+		return;
+	}
+#endif
+	memcpy_flushcache(dest, source, size);
+}
+
 static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
 {
 	void *buf;
@@ -1164,7 +1200,7 @@ static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data
 			}
 		} else {
 			flush_dcache_page(bio_page(bio));
-			memcpy_flushcache(data, buf, size);
+			memcpy_flushcache_optimized(data, buf, size);
 		}
 
 		bvec_kunmap_irq(buf, &flags);
-- 
2.16.4