|
Takashi Iwai |
ff3ce0 |
From 20f3337d350c4e1b4ac66d731fd4e98565bf6cc0 Mon Sep 17 00:00:00 2001
|
|
Takashi Iwai |
ff3ce0 |
From: Linus Torvalds <torvalds@linux-foundation.org>
|
|
Takashi Iwai |
ff3ce0 |
Date: Sat, 15 Apr 2023 12:01:14 -0700
|
|
Takashi Iwai |
ff3ce0 |
Subject: [PATCH] x86: don't use REP_GOOD or ERMS for small memory clearing
|
|
Takashi Iwai |
ff3ce0 |
Git-commit: 20f3337d350c4e1b4ac66d731fd4e98565bf6cc0
|
|
Takashi Iwai |
ff3ce0 |
Patch-mainline: v6.4-rc1
|
|
Takashi Iwai |
ff3ce0 |
References: bsc#1211140
|
|
Takashi Iwai |
ff3ce0 |
|
|
Takashi Iwai |
ff3ce0 |
The modern target to use is FSRS (Fast Short REP STOS), and the other
|
|
Takashi Iwai |
ff3ce0 |
cases should only be used for bigger areas (i.e. mainly things like page
|
|
Takashi Iwai |
ff3ce0 |
clearing).
|
|
Takashi Iwai |
ff3ce0 |
|
|
Takashi Iwai |
ff3ce0 |
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
|
|
Takashi Iwai |
ff3ce0 |
Acked-by: Takashi Iwai <tiwai@suse.de>
|
|
Takashi Iwai |
ff3ce0 |
|
|
Takashi Iwai |
ff3ce0 |
---
|
|
Takashi Iwai |
ff3ce0 |
arch/x86/lib/memset_64.S | 47 +++++++++++------------------------------------
|
|
Takashi Iwai |
ff3ce0 |
1 file changed, 11 insertions(+), 36 deletions(-)
|
|
Takashi Iwai |
ff3ce0 |
|
|
Takashi Iwai |
ff3ce0 |
--- a/arch/x86/lib/memset_64.S
|
|
Takashi Iwai |
ff3ce0 |
+++ b/arch/x86/lib/memset_64.S
|
|
Takashi Iwai |
ff3ce0 |
@@ -16,28 +16,23 @@
|
|
Takashi Iwai |
ff3ce0 |
* rdx count (bytes)
|
|
Takashi Iwai |
ff3ce0 |
*
|
|
Takashi Iwai |
ff3ce0 |
* rax original destination
|
|
Takashi Iwai |
ff3ce0 |
+ *
|
|
Takashi Iwai |
ff3ce0 |
+ * The FSRS alternative should be done inline (avoiding the call and
|
|
Takashi Iwai |
ff3ce0 |
+ * the disgusting return handling), but that would require some help
|
|
Takashi Iwai |
ff3ce0 |
+ * from the compiler for better calling conventions.
|
|
Takashi Iwai |
ff3ce0 |
+ *
|
|
Takashi Iwai |
ff3ce0 |
+ * The 'rep stosb' itself is small enough to replace the call, but all
|
|
Takashi Iwai |
ff3ce0 |
+ * the register moves blow up the code. And two of them are "needed"
|
|
Takashi Iwai |
ff3ce0 |
+ * only for the return value that is the same as the destination input,
|
|
Takashi Iwai |
ff3ce0 |
+ * which the compiler could/should do much better anyway.
|
|
Takashi Iwai |
ff3ce0 |
*/
|
|
Takashi Iwai |
ff3ce0 |
SYM_FUNC_START_WEAK(memset)
|
|
Takashi Iwai |
ff3ce0 |
SYM_FUNC_START(__memset)
|
|
Takashi Iwai |
ff3ce0 |
- /*
|
|
Takashi Iwai |
ff3ce0 |
- * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
|
|
Takashi Iwai |
ff3ce0 |
- * to use it when possible. If not available, use fast string instructions.
|
|
Takashi Iwai |
ff3ce0 |
- *
|
|
Takashi Iwai |
ff3ce0 |
- * Otherwise, use original memset function.
|
|
Takashi Iwai |
ff3ce0 |
- */
|
|
Takashi Iwai |
ff3ce0 |
- ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
|
|
Takashi Iwai |
ff3ce0 |
- "jmp memset_erms", X86_FEATURE_ERMS
|
|
Takashi Iwai |
ff3ce0 |
+ ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS
|
|
Takashi Iwai |
ff3ce0 |
|
|
Takashi Iwai |
ff3ce0 |
movq %rdi,%r9
|
|
Takashi Iwai |
ff3ce0 |
+ movb %sil,%al
|
|
Takashi Iwai |
ff3ce0 |
movq %rdx,%rcx
|
|
Takashi Iwai |
ff3ce0 |
- andl $7,%edx
|
|
Takashi Iwai |
ff3ce0 |
- shrq $3,%rcx
|
|
Takashi Iwai |
ff3ce0 |
- /* expand byte value */
|
|
Takashi Iwai |
ff3ce0 |
- movzbl %sil,%esi
|
|
Takashi Iwai |
ff3ce0 |
- movabs $0x0101010101010101,%rax
|
|
Takashi Iwai |
ff3ce0 |
- imulq %rsi,%rax
|
|
Takashi Iwai |
ff3ce0 |
- rep stosq
|
|
Takashi Iwai |
ff3ce0 |
- movl %edx,%ecx
|
|
Takashi Iwai |
ff3ce0 |
rep stosb
|
|
Takashi Iwai |
ff3ce0 |
movq %r9,%rax
|
|
Takashi Iwai |
ff3ce0 |
RET
|
|
Takashi Iwai |
ff3ce0 |
@@ -46,26 +41,6 @@ SYM_FUNC_END_ALIAS(memset)
|
|
Takashi Iwai |
ff3ce0 |
EXPORT_SYMBOL(memset)
|
|
Takashi Iwai |
ff3ce0 |
EXPORT_SYMBOL(__memset)
|
|
Takashi Iwai |
ff3ce0 |
|
|
Takashi Iwai |
ff3ce0 |
-/*
|
|
Takashi Iwai |
ff3ce0 |
- * ISO C memset - set a memory block to a byte value. This function uses
|
|
Takashi Iwai |
ff3ce0 |
- * enhanced rep stosb to override the fast string function.
|
|
Takashi Iwai |
ff3ce0 |
- * The code is simpler and shorter than the fast string function as well.
|
|
Takashi Iwai |
ff3ce0 |
- *
|
|
Takashi Iwai |
ff3ce0 |
- * rdi destination
|
|
Takashi Iwai |
ff3ce0 |
- * rsi value (char)
|
|
Takashi Iwai |
ff3ce0 |
- * rdx count (bytes)
|
|
Takashi Iwai |
ff3ce0 |
- *
|
|
Takashi Iwai |
ff3ce0 |
- * rax original destination
|
|
Takashi Iwai |
ff3ce0 |
- */
|
|
Takashi Iwai |
ff3ce0 |
-SYM_FUNC_START_LOCAL(memset_erms)
|
|
Takashi Iwai |
ff3ce0 |
- movq %rdi,%r9
|
|
Takashi Iwai |
ff3ce0 |
- movb %sil,%al
|
|
Takashi Iwai |
ff3ce0 |
- movq %rdx,%rcx
|
|
Takashi Iwai |
ff3ce0 |
- rep stosb
|
|
Takashi Iwai |
ff3ce0 |
- movq %r9,%rax
|
|
Takashi Iwai |
ff3ce0 |
- RET
|
|
Takashi Iwai |
ff3ce0 |
-SYM_FUNC_END(memset_erms)
|
|
Takashi Iwai |
ff3ce0 |
-
|
|
Takashi Iwai |
ff3ce0 |
SYM_FUNC_START_LOCAL(memset_orig)
|
|
Takashi Iwai |
ff3ce0 |
movq %rdi,%r10
|
|
Takashi Iwai |
ff3ce0 |
|