From: Heiko Carstens <heiko.carstens@de.ibm.com>
Subject: kernel: jump label transformation performance

References: bsc#1137534 bsc#1137535 LTC#178058 LTC#178059
Patch-mainline: v5.3-rc1
Git-commit: a646ef398e72a2ac40bea974808ffcf1bea4e7f4

Description:   kernel: jump label transformation performance
Symptom:       Unresponsive systems together with a huge number of
               observable diagnose 0x44 calls.
Problem:       Jump label instruction patching is done in the context of
               stop_machine, which synchronizes all CPUs of a system.
               If this happens concurrently on many virtual systems and the
               sum of all virtual CPUs is (significantly) higher than the
               number of physical CPUs of the underlying hypervisor, this
               may lead to long delays when the kernel tries to synchronize
               CPUs. In worst-case scenarios this can leave systems
               unresponsive for several minutes.
Solution:      For jump label instruction patching it is not necessary to
               synchronize all CPUs. Instead the mask bits of the branch
               instruction in use can simply be overwritten. To make the
               patched instruction visible to all other CPUs in the
               configuration, a subsequent signal processor (SIGP) to each
               of them is sufficient; see the sketches below.
Reproduction:  -
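
A minimal sketch (not part of the patch) of why only the mask bits need to
be touched. It assumes the RIL format of the s390 brcl instruction, where
the condition mask sits in bits 8-11 of the opcode halfword; the body of
jump_label_make_branch() is reconstructed here for illustration, using
brcl 15 (0xc0f4):

	struct insn {
		u16 opcode;	/* 0xc0m4: brcl with mask m in bits 8-11 */
		s32 offset;	/* signed branch offset in halfwords */
	} __packed;

	static void jump_label_make_nop(struct jump_entry *entry, struct insn *insn)
	{
		/* brcl 0,offset - mask 0, branch never taken (nop) */
		insn->opcode = 0xc004;
		insn->offset = (entry->target - entry->code) >> 1;
	}

	static void jump_label_make_branch(struct jump_entry *entry, struct insn *insn)
	{
		/* brcl 15,offset - mask 15, branch always taken */
		insn->opcode = 0xc0f4;
		insn->offset = (entry->target - entry->code) >> 1;
	}

With both variants encoding the same offset, the two six-byte instructions
differ in a single byte (0xc0 04 ... vs. 0xc0 f4 ...), so enabling or
disabling a jump label rewrites only the mask part in place.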

Upstream-Description:

              s390/jump_label: replace stop_machine with smp_call_function

              The use of stop_machine to replace the mask bits of the jump label branch
              is a very heavy-weight operation. This is in fact not necessary; the
              mask of the branch can simply be updated, followed by a signal processor
              to all the other CPUs to force them to pick up the modified instruction.

              Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
              [heiko.carstens@de.ibm.com]: Change jump_label_make_nop() so we get
                                           brcl 0,offset instead of brcl 0,0. This
                                           makes sure that only the mask part of the
                                           instruction gets changed when updated.
              Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Michal Suchanek <msuchanek@suse.de>
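
For reference, the resulting flow in arch_jump_label_transform() in
annotated form (the code matches the jump_label.c hunk below; the comments
are editorial):

	static void __jump_label_sync(void *dummy)
	{
		/* Intentionally empty: the signal processor (SIGP)
		 * interrupt that delivers this call is a serializing
		 * event, so once every CPU has taken it, all CPUs
		 * fetch the patched instruction. */
	}

	void arch_jump_label_transform(struct jump_entry *entry,
				       enum jump_label_type type)
	{
		/* Rewrite the mask bits of the brcl at entry->code. */
		__jump_label_transform(entry, type, 0);
		/* wait == 1: return only after every other CPU has
		 * executed __jump_label_sync(). */
		smp_call_function(__jump_label_sync, NULL, 1);
	}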
---
 arch/s390/kernel/jump_label.c |   18 +++++-------------
 arch/s390/mm/maccess.c        |    9 +++++----
 2 files changed, 10 insertions(+), 17 deletions(-)

--- a/arch/s390/kernel/jump_label.c
+++ b/arch/s390/kernel/jump_label.c
@@ -23,9 +23,9 @@ struct insn_args {
 
 static void jump_label_make_nop(struct jump_entry *entry, struct insn *insn)
 {
-	/* brcl 0,0 */
+	/* brcl 0,offset */
 	insn->opcode = 0xc004;
-	insn->offset = 0;
+	insn->offset = (entry->target - entry->code) >> 1;
 }
 
 static void jump_label_make_branch(struct jump_entry *entry, struct insn *insn)
@@ -77,23 +77,15 @@ static void __jump_label_transform(struc
 	s390_kernel_write((void *)entry->code, &new, sizeof(new));
 }
 
-static int __sm_arch_jump_label_transform(void *data)
+static void __jump_label_sync(void *dummy)
 {
-	struct insn_args *args = data;
-
-	__jump_label_transform(args->entry, args->type, 0);
-	return 0;
 }
 
 void arch_jump_label_transform(struct jump_entry *entry,
 			       enum jump_label_type type)
 {
-	struct insn_args args;
-
-	args.entry = entry;
-	args.type = type;
-
-	stop_machine_cpuslocked(__sm_arch_jump_label_transform, &args, NULL);
+	__jump_label_transform(entry, type, 0);
+	smp_call_function(__jump_label_sync, NULL, 1);
 }
 
 void arch_jump_label_transform_static(struct jump_entry *entry,
--- a/arch/s390/mm/maccess.c
+++ b/arch/s390/mm/maccess.c
@@ -50,21 +50,22 @@ static notrace long s390_kernel_write_od
  * Therefore we have a read-modify-write sequence: the function reads eight
  * bytes from destination at an eight byte boundary, modifies the bytes
  * requested and writes the result back in a loop.
- *
- * Note: this means that this function may not be called concurrently on
- *	 several cpus with overlapping words, since this may potentially
- *	 cause data corruption.
  */
+static DEFINE_SPINLOCK(s390_kernel_write_lock);
+
 void notrace s390_kernel_write(void *dst, const void *src, size_t size)
 {
+	unsigned long flags;
 	long copied;
 
+	spin_lock_irqsave(&s390_kernel_write_lock, flags);
 	while (size) {
 		copied = s390_kernel_write_odd(dst, src, size);
 		dst += copied;
 		src += copied;
 		size -= copied;
 	}
+	spin_unlock_irqrestore(&s390_kernel_write_lock, flags);
 }
 
 static int __memcpy_real(void *dest, void *src, size_t count)
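
Editorial note on the maccess.c half of the patch: s390_kernel_write()
modifies write-protected kernel text through the eight-byte aligned
read-modify-write described in the comment above. Once stop_machine no
longer serializes callers, two CPUs patching overlapping words could race;
a sketch of the lost update the new spinlock prevents (hypothetical
interleaving, not kernel code):

	/*
	 * CPU A and CPU B both target bytes inside the same aligned
	 * eight-byte word, without s390_kernel_write_lock:
	 *
	 *   CPU A: read 8 bytes         CPU B: read 8 bytes
	 *   CPU A: modify bytes 0-1
	 *                               CPU B: modify bytes 4-5
	 *   CPU A: write 8 bytes back
	 *                               CPU B: write 8 bytes back
	 *                                      (reverts A's bytes 0-1)
	 */

Taking s390_kernel_write_lock around the whole loop makes the
read-modify-write atomic with respect to other callers, turning the old
"may not be called concurrently" note into an enforced rule.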