Blob Blame History Raw
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:29 +0100
Subject: x86/entry/64: Make cpu_entry_area.tss read-only
Git-commit: c482feefe1aeb150156248ba0fd3e029bc886605
Patch-mainline: v4.15-rc5
References: bsc#1068032 CVE-2017-5754

The TSS is a fairly juicy target for exploits, and, now that the TSS
is in the cpu_entry_area, it's no longer protected by kASLR.  Make it
read-only on x86_64.

On x86_32, it can't be RO because it's written by the CPU during task
switches, and we use a task gate for double faults.  I'd also be
nervous about errata if we tried to make it RO even on configurations
without double fault handling.

[ tglx: AMD confirmed that there is no problem on 64-bit with TSS RO.  So
  	it's probably safe to assume that it's a non issue, though Intel
  	might have been creative in that area. Still waiting for
  	confirmation. ]

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bpetkov@suse.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150606.733700132@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/entry/entry_32.S          |    4 ++--
 arch/x86/entry/entry_64.S          |    8 ++++----
 arch/x86/include/asm/fixmap.h      |   13 +++++++++----
 arch/x86/include/asm/processor.h   |   17 ++++++++---------
 arch/x86/include/asm/switch_to.h   |    4 ++--
 arch/x86/include/asm/thread_info.h |    2 +-
 arch/x86/kernel/asm-offsets.c      |    5 ++---
 arch/x86/kernel/asm-offsets_32.c   |    4 ++--
 arch/x86/kernel/cpu/common.c       |   29 +++++++++++++++++++----------
 arch/x86/kernel/ioport.c           |    2 +-
 arch/x86/kernel/process.c          |    6 +++---
 arch/x86/kernel/process_32.c       |    2 +-
 arch/x86/kernel/process_64.c       |    2 +-
 arch/x86/kernel/traps.c            |    4 ++--
 arch/x86/lib/delay.c               |    4 ++--
 arch/x86/xen/enlighten_pv.c        |    2 +-
 16 files changed, 60 insertions(+), 48 deletions(-)

--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -957,7 +957,7 @@ ENTRY(debug)
 
 	/* Are we currently on the SYSENTER stack? */
 	movl	PER_CPU_VAR(cpu_entry_area), %ecx
-	addl	$CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
+	addl	$CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
 	subl	%eax, %ecx	/* ecx = (end of SYSENTER_stack) - esp */
 	cmpl	$SIZEOF_SYSENTER_stack, %ecx
 	jb	.Ldebug_from_sysenter_stack
@@ -1001,7 +1001,7 @@ ENTRY(nmi)
 
 	/* Are we currently on the SYSENTER stack? */
 	movl	PER_CPU_VAR(cpu_entry_area), %ecx
-	addl	$CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
+	addl	$CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
 	subl	%eax, %ecx	/* ecx = (end of SYSENTER_stack) - esp */
 	cmpl	$SIZEOF_SYSENTER_stack, %ecx
 	jb	.Lnmi_from_sysenter_stack
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -153,7 +153,7 @@ END(native_usergs_sysret64)
 	_entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
 
 /* The top word of the SYSENTER stack is hot and is usable as scratch space. */
-#define RSP_SCRATCH	CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + \
+#define RSP_SCRATCH	CPU_ENTRY_AREA_SYSENTER_stack + \
 			SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA
 
 ENTRY(entry_SYSCALL_64_trampoline)
@@ -388,7 +388,7 @@ syscall_return_via_sysret:
 	 * Save old stack pointer and switch to trampoline stack.
 	 */
 	movq	%rsp, %rdi
-	movq	PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
+	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
 
 	pushq	RSP-RDI(%rdi)	/* RSP */
 	pushq	(%rdi)		/* RDI */
@@ -717,7 +717,7 @@ GLOBAL(swapgs_restore_regs_and_return_to
 	 * Save old stack pointer and switch to trampoline stack.
 	 */
 	movq	%rsp, %rdi
-	movq	PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
+	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
 
 	/* Copy the IRET frame to the trampoline stack. */
 	pushq	6*8(%rdi)	/* SS */
@@ -949,7 +949,7 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work
 /*
  * Exception entry points.
  */
-#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
+#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
 
 /*
  * Switch to the thread stack.  This is called with the IRET frame and
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -56,9 +56,14 @@ struct cpu_entry_area {
 	char gdt[PAGE_SIZE];
 
 	/*
-	 * The GDT is just below cpu_tss and thus serves (on x86_64) as a
-	 * a read-only guard page for the SYSENTER stack at the bottom
-	 * of the TSS region.
+	 * The GDT is just below SYSENTER_stack and thus serves (on x86_64) as
+	 * a a read-only guard page.
+	 */
+	struct SYSENTER_stack_page SYSENTER_stack_page;
+
+	/*
+	 * On x86_64, the TSS is mapped RO.  On x86_32, it's mapped RW because
+	 * we need task switches to work, and task switches write to the TSS.
 	 */
 	struct tss_struct tss;
 
@@ -241,7 +246,7 @@ static inline struct cpu_entry_area *get
 
 static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu)
 {
-	return &get_cpu_entry_area(cpu)->tss.SYSENTER_stack;
+	return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack;
 }
 
 #endif /* !__ASSEMBLY__ */
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -339,13 +339,11 @@ struct SYSENTER_stack {
 	unsigned long		words[64];
 };
 
-struct tss_struct {
-	/*
-	 * Space for the temporary SYSENTER stack, used for SYSENTER
-	 * and the entry trampoline as well.
-	 */
-	struct SYSENTER_stack	SYSENTER_stack;
+struct SYSENTER_stack_page {
+	struct SYSENTER_stack stack;
+} __aligned(PAGE_SIZE);
 
+struct tss_struct {
 	/*
 	 * The fixed hardware portion.  This must not cross a page boundary
 	 * at risk of violating the SDM's advice and potentially triggering
@@ -362,7 +360,7 @@ struct tss_struct {
 	unsigned long		io_bitmap[IO_BITMAP_LONGS + 1];
 } __aligned(PAGE_SIZE);
 
-DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss);
+DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);
 
 /*
  * sizeof(unsigned long) coming from an extra "long" at the end
@@ -377,7 +375,8 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_
 #ifdef CONFIG_X86_32
 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
 #else
-#define cpu_current_top_of_stack cpu_tss.x86_tss.sp1
+/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
+#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
 #endif
 
 /*
@@ -537,7 +536,7 @@ static inline void native_set_iopl_mask(
 static inline void
 native_load_sp0(unsigned long sp0)
 {
-	this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
+	this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
 }
 
 static inline void native_swapgs(void)
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -78,10 +78,10 @@ do {									\
 static inline void refresh_sysenter_cs(struct thread_struct *thread)
 {
 	/* Only happens when SEP is enabled, no need to test "SEP"arately: */
-	if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs))
+	if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs))
 		return;
 
-	this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs);
+	this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs);
 	wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
 }
 #endif
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -203,7 +203,7 @@ static inline int arch_within_stack_fram
 #else /* !__ASSEMBLY__ */
 
 #ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss + TSS_sp1)
+# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
 #endif
 
 #endif
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -49,8 +49,8 @@ void foo(void)
 	BLANK();
 
 	/* Offset from the sysenter stack to tss.sp0 */
-	DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
-	       offsetofend(struct tss_struct, SYSENTER_stack));
+	DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
+	       offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack));
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 	BLANK();
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -93,10 +93,9 @@ void common(void) {
 	BLANK();
 	DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
 
-	OFFSET(TSS_STRUCT_SYSENTER_stack, tss_struct, SYSENTER_stack);
-	DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack));
-
 	/* Layout info for cpu_entry_area */
 	OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
 	OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
+	OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page);
+	DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack));
 }
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -469,6 +469,9 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char,
 	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
 #endif
 
+static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page,
+				   SYSENTER_stack_storage);
+
 static void __init
 set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
 {
@@ -482,23 +485,29 @@ static void __init setup_cpu_entry_area(
 #ifdef CONFIG_X86_64
 	extern char _entry_trampoline[];
 
-	/* On 64-bit systems, we use a read-only fixmap GDT. */
+	/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
 	pgprot_t gdt_prot = PAGE_KERNEL_RO;
+	pgprot_t tss_prot = PAGE_KERNEL_RO;
 #else
 	/*
 	 * On native 32-bit systems, the GDT cannot be read-only because
 	 * our double fault handler uses a task gate, and entering through
-	 * a task gate needs to change an available TSS to busy.  If the GDT
-	 * is read-only, that will triple fault.
+	 * a task gate needs to change an available TSS to busy.  If the
+	 * GDT is read-only, that will triple fault.  The TSS cannot be
+	 * read-only because the CPU writes to it on task switches.
 	 *
-	 * On Xen PV, the GDT must be read-only because the hypervisor requires
-	 * it.
+	 * On Xen PV, the GDT must be read-only because the hypervisor
+	 * requires it.
 	 */
 	pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
 		PAGE_KERNEL_RO : PAGE_KERNEL;
+	pgprot_t tss_prot = PAGE_KERNEL;
 #endif
 
 	__set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
+	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page),
+				per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1,
+				PAGE_KERNEL);
 
 	/*
 	 * The Intel SDM says (Volume 3, 7.2.1):
@@ -521,9 +530,9 @@ static void __init setup_cpu_entry_area(
 		      offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
 	BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
 	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
-				&per_cpu(cpu_tss, cpu),
+				&per_cpu(cpu_tss_rw, cpu),
 				sizeof(struct tss_struct) / PAGE_SIZE,
-				PAGE_KERNEL);
+				tss_prot);
 
 #ifdef CONFIG_X86_32
 	per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
@@ -1279,7 +1288,7 @@ void enable_sep_cpu(void)
 		return;
 
 	cpu = get_cpu();
-	tss = &per_cpu(cpu_tss, cpu);
+	tss = &per_cpu(cpu_tss_rw, cpu);
 
 	/*
 	 * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
@@ -1560,7 +1569,7 @@ void cpu_init(void)
 	if (cpu)
 		load_ucode_ap();
 
-	t = &per_cpu(cpu_tss, cpu);
+	t = &per_cpu(cpu_tss_rw, cpu);
 	oist = &per_cpu(orig_ist, cpu);
 
 #ifdef CONFIG_NUMA
@@ -1651,7 +1660,7 @@ void cpu_init(void)
 {
 	int cpu = smp_processor_id();
 	struct task_struct *curr = current;
-	struct tss_struct *t = &per_cpu(cpu_tss, cpu);
+	struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);
 
 	wait_for_master_cpu(cpu);
 
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -66,7 +66,7 @@ asmlinkage long sys_ioperm(unsigned long
 	 * because the ->io_bitmap_max value must match the bitmap
 	 * contents:
 	 */
-	tss = &per_cpu(cpu_tss, get_cpu());
+	tss = &per_cpu(cpu_tss_rw, get_cpu());
 
 	if (turn_on)
 		bitmap_clear(t->io_bitmap_ptr, from, num);
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p,
 	struct fpu *prev_fpu = &prev->fpu;
 	struct fpu *next_fpu = &next->fpu;
 	int cpu = smp_processor_id();
-	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
+	struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
 
 	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
 
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -399,7 +399,7 @@ __switch_to(struct task_struct *prev_p,
 	struct fpu *prev_fpu = &prev->fpu;
 	struct fpu *next_fpu = &next->fpu;
 	int cpu = smp_processor_id();
-	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
+	struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
 
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
 		     this_cpu_read(irq_count) != -1);
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -46,7 +46,7 @@
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = {
 	.x86_tss = {
 		/*
 		 * .sp0 is only used when entering ring 0 from a lower
@@ -81,7 +81,7 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(st
 	.io_bitmap		= { [0 ... IO_BITMAP_LONGS] = ~0 },
 #endif
 };
-EXPORT_PER_CPU_SYMBOL(cpu_tss);
+EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
 
 DEFINE_PER_CPU(bool, __tss_limit_invalid);
 EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
@@ -110,7 +110,7 @@ void exit_thread(struct task_struct *tsk
 	struct fpu *fpu = &t->fpu;
 
 	if (bp) {
-		struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
+		struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu());
 
 		t->io_bitmap_ptr = NULL;
 		clear_thread_flag(TIF_IO_BITMAP);
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -372,7 +372,7 @@ dotraplinkage void do_double_fault(struc
 		regs->cs == __KERNEL_CS &&
 		regs->ip == (unsigned long)native_irq_return_iret)
 	{
-		struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1;
+		struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
 
 		/*
 		 * regs->sp points to the failing IRET frame on the
@@ -657,7 +657,7 @@ struct bad_iret_stack *fixup_bad_iret(st
 	 * exception came from the IRET target.
 	 */
 	struct bad_iret_stack *new_stack =
-		(struct bad_iret_stack *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1;
+		(struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
 
 	/* Copy the IRET target to the new stack. */
 	memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -106,10 +106,10 @@ static void delay_mwaitx(unsigned long _
 		delay = min_t(u64, MWAITX_MAX_LOOPS, loops);
 
 		/*
-		 * Use cpu_tss as a cacheline-aligned, seldomly
+		 * Use cpu_tss_rw as a cacheline-aligned, seldomly
 		 * accessed per-cpu variable as the monitor target.
 		 */
-		__monitorx(raw_cpu_ptr(&cpu_tss), 0, 0);
+		__monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
 
 		/*
 		 * AMD, like Intel, supports the EAX hint and EAX=0xf
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -840,7 +840,7 @@ static void xen_load_sp0(unsigned long s
 	mcs = xen_mc_entry(0);
 	MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
 	xen_mc_issue(PARAVIRT_LAZY_CPU);
-	this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
+	this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
 }
 
 void xen_set_iopl_mask(unsigned mask)