From: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
Subject: Linux: 4.2
Patch-mainline: Never, SUSE-Xen specific
This patch contains the Xen-specific differences between Linux 4.1 and 4.2.
Automatically created from "patch-4.2" by xen-port-patches.py
Acked-by: Jan Beulich <jbeulich@suse.com>
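
The bulk of the update tracks the 4.2 entry-code rework: entry_32.S and
entry_64.S moved from arch/x86/kernel/ to arch/x86/entry/, the dwarf2/CFI
annotation machinery (<asm/dwarf2.h> and the *_cfi / CFI_* / RING0_*_FRAME
macros) was removed, and the main entry points were renamed. The
Xen-modified copies of these files are adjusted to match.

As an illustration only (not part of the patch), most of the churn below is
the mechanical collapse of the CFI wrapper macros back to bare
instructions. Before 4.2, <asm/dwarf2.h> defined approximately:

	.macro pushl_cfi reg
	pushl	\reg
	CFI_ADJUST_CFA_OFFSET 4		# dwarf2 unwind bookkeeping
	.endm

so with the annotations gone, every "pushl_cfi %reg" becomes a plain
"pushl %reg" (likewise popl_cfi, pushfl_cfi, popfl_cfi), while the
RING0_INT_FRAME / RING0_EC_FRAME / RING0_PTREGS_FRAME prologues and the
CFI_STARTPROC/CFI_ENDPROC pairs disappear entirely.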
--- a/arch/arm/include/asm/xen/hypervisor.h
+++ b/arch/arm/include/asm/xen/hypervisor.h
@@ -20,7 +20,7 @@ static inline enum paravirt_lazy_mode pa
extern struct dma_map_ops *xen_dma_ops;
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
void __init xen_early_init(void);
#else
static inline void xen_early_init(void) { return; }
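
(In this tree CONFIG_PARAVIRT_XEN denotes the upstream pvops Xen guest
support, while CONFIG_XEN refers to the forward-ported XenLinux
implementation; the ARM code only exists for the former, hence the rename
here. The same pattern recurs below, e.g. CONFIG_XEN_PCIDEV_FRONTEND
becoming CONFIG_PARAVIRT_XEN_PCIDEV_FRONTEND in xen.config.)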
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -902,7 +902,7 @@ config NR_CPUS
config SCHED_SMT
bool "SMT (Hyperthreading) scheduler support"
- depends on SMP
+ depends on SMP && !XEN
---help---
SMT scheduler support improves the CPU scheduler's decision making
when dealing with Intel Pentium 4 chips with HyperThreading at a
@@ -912,7 +912,7 @@ config SCHED_SMT
config SCHED_MC
def_bool y
prompt "Multi-core scheduler support"
- depends on SMP
+ depends on SMP && !XEN
---help---
Multi-core scheduler support improves the CPU scheduler's decision
making when dealing with multi-core CPU chips at a cost of slightly
@@ -955,7 +955,7 @@ config X86_LOCAL_APIC
depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI
depends on !XEN_UNPRIVILEGED_GUEST
select IRQ_DOMAIN_HIERARCHY
- select PCI_MSI_IRQ_DOMAIN if PCI_MSI
+ select PCI_MSI_IRQ_DOMAIN if PCI_MSI && !XEN
config X86_IO_APIC
def_bool y
@@ -1044,6 +1044,7 @@ config X86_LEGACY_VM86
bool "Legacy VM86 support"
default n
depends on X86_32
+ depends on !XEN || (XEN_COMPAT < 0x040300)
---help---
This option allows user programs to put the CPU into V8086
mode, which is an 80286-era approximation of 16-bit real mode.
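
The Kconfig changes above disable topology-aware scheduling (SCHED_SMT,
SCHED_MC) for XEN kernels, since vCPUs are placed on physical CPUs by the
hypervisor and the guest scheduler gains nothing from sibling or core
awareness. The new X86_LEGACY_VM86 option is likewise restricted:
presumably newer hypervisors no longer provide vm86 assistance to PV
guests, hence the XEN_COMPAT < 0x040300 escape hatch.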
--- a/arch/x86/configs/xen.config
+++ b/arch/x86/configs/xen.config
@@ -22,7 +22,7 @@ CONFIG_XEN_ACPI_PROCESSOR=m
# x86 specific backend drivers
CONFIG_XEN_PCIDEV_BACKEND=m
# x86 specific frontend drivers
-CONFIG_XEN_PCIDEV_FRONTEND=m
+CONFIG_PARAVIRT_XEN_PCIDEV_FRONTEND=m
# depends on MEMORY_HOTPLUG, arm64 doesn't enable this yet,
# move to generic config if it ever does.
CONFIG_XEN_BALLOON_MEMORY_HOTPLUG=y
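
The entry-point renames visible in the rest of the patch, for reference:

	system_call              -> entry_INT80_32
	ia32_sysenter_target     -> entry_SYSENTER_32
	ia32pv_sysenter_target   -> entry_SYSENTER_PV32   (Xen-only)
	ia32pv_cstar_target      -> entry_SYSCALL_PV32    (Xen-only)

The first two follow upstream; the PV32 names are local choices made to
fit the new upstream naming scheme.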
--- a/arch/x86/entry/entry_32-xen.S
+++ b/arch/x86/entry/entry_32-xen.S
@@ -1,29 +1,18 @@
/*
+ * Copyright (C) 1991,1992 Linus Torvalds
*
- * Copyright (C) 1991, 1992 Linus Torvalds
- */
-
-/*
- * entry.S contains the system-call and fault low-level handling routines.
- * This also contains the timer-interrupt handler, as well as all interrupts
- * and faults that can result in a task-switch.
- *
- * NOTE: This code handles signal-recognition, which happens every time
- * after a timer-interrupt and after each system call.
- *
- * I changed all the .align's to 4 (16 byte alignment), as that's faster
- * on a 486.
+ * entry_32.S contains the system-call and low-level fault and trap handling routines.
*
* Stack layout in 'syscall_exit':
- * ptrace needs to have all regs on the stack.
- * if the order here is changed, it needs to be
- * updated in fork.c:copy_process, signal.c:do_signal,
+ * ptrace needs to have all registers on the stack.
+ * If the order here is changed, it needs to be
+ * updated in fork.c:copy_process(), signal.c:do_signal(),
* ptrace.c and ptrace.h
*
* 0(%esp) - %ebx
* 4(%esp) - %ecx
* 8(%esp) - %edx
- * C(%esp) - %esi
+ * C(%esp) - %esi
* 10(%esp) - %edi
* 14(%esp) - %ebp
* 18(%esp) - %eax
@@ -37,8 +26,6 @@
* 38(%esp) - %eflags
* 3C(%esp) - %oldesp
* 40(%esp) - %oldss
- *
- * "current" is in register %ebx during any slow entries.
*/
#include <linux/linkage.h>
@@ -50,7 +37,6 @@
#include <asm/smp.h>
#include <asm/page_types.h>
#include <asm/percpu.h>
-#include <asm/dwarf2.h>
#include <asm/processor-flags.h>
#include <asm/ftrace.h>
#include <asm/irq_vectors.h>
@@ -63,11 +49,11 @@
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
-#define __AUDIT_ARCH_LE 0x40000000
+#define __AUDIT_ARCH_LE 0x40000000
#ifndef CONFIG_AUDITSYSCALL
-#define sysenter_audit syscall_trace_entry
-#define sysexit_audit syscall_exit_work
+# define sysenter_audit syscall_trace_entry
+# define sysexit_audit syscall_exit_work
#endif
.section .entry.text, "ax"
@@ -89,16 +75,16 @@
NMI_MASK = 0x80000000
#ifdef CONFIG_PREEMPT
-#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
+# define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
#else
-#define preempt_stop(clobbers)
-#define resume_kernel restore_all
+# define preempt_stop(clobbers)
+# define resume_kernel restore_all
#endif
.macro TRACE_IRQS_IRET
#ifdef CONFIG_TRACE_IRQFLAGS
- testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off?
- jz 1f
+ testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off?
+ jz 1f
TRACE_IRQS_ON
1:
#endif
@@ -117,11 +103,10 @@ NMI_MASK = 0x80000000
/* unfortunately push/pop can't be no-op */
.macro PUSH_GS
- pushl_cfi $0
+ pushl $0
.endm
.macro POP_GS pop=0
- addl $(4 + \pop), %esp
- CFI_ADJUST_CFA_OFFSET -(4 + \pop)
+ addl $(4 + \pop), %esp
.endm
.macro POP_GS_EX
.endm
@@ -141,181 +126,119 @@ NMI_MASK = 0x80000000
#else /* CONFIG_X86_32_LAZY_GS */
.macro PUSH_GS
- pushl_cfi %gs
- /*CFI_REL_OFFSET gs, 0*/
+ pushl %gs
.endm
.macro POP_GS pop=0
-98: popl_cfi %gs
- /*CFI_RESTORE gs*/
+98: popl %gs
.if \pop <> 0
- add $\pop, %esp
- CFI_ADJUST_CFA_OFFSET -\pop
+ add $\pop, %esp
.endif
.endm
.macro POP_GS_EX
.pushsection .fixup, "ax"
-99: movl $0, (%esp)
- jmp 98b
+99: movl $0, (%esp)
+ jmp 98b
.popsection
- _ASM_EXTABLE(98b,99b)
+ _ASM_EXTABLE(98b, 99b)
.endm
.macro PTGS_TO_GS
-98: mov PT_GS(%esp), %gs
+98: mov PT_GS(%esp), %gs
.endm
.macro PTGS_TO_GS_EX
.pushsection .fixup, "ax"
-99: movl $0, PT_GS(%esp)
- jmp 98b
+99: movl $0, PT_GS(%esp)
+ jmp 98b
.popsection
- _ASM_EXTABLE(98b,99b)
+ _ASM_EXTABLE(98b, 99b)
.endm
.macro GS_TO_REG reg
- movl %gs, \reg
- /*CFI_REGISTER gs, \reg*/
+ movl %gs, \reg
.endm
.macro REG_TO_PTGS reg
- movl \reg, PT_GS(%esp)
- /*CFI_REL_OFFSET gs, PT_GS*/
+ movl \reg, PT_GS(%esp)
.endm
.macro SET_KERNEL_GS reg
- movl $(__KERNEL_STACK_CANARY), \reg
- movl \reg, %gs
+ movl $(__KERNEL_STACK_CANARY), \reg
+ movl \reg, %gs
.endm
-#endif /* CONFIG_X86_32_LAZY_GS */
+#endif /* CONFIG_X86_32_LAZY_GS */
.macro SAVE_ALL
cld
PUSH_GS
- pushl_cfi %fs
- /*CFI_REL_OFFSET fs, 0;*/
- pushl_cfi %es
- /*CFI_REL_OFFSET es, 0;*/
- pushl_cfi %ds
- /*CFI_REL_OFFSET ds, 0;*/
- pushl_cfi %eax
- CFI_REL_OFFSET eax, 0
- pushl_cfi %ebp
- CFI_REL_OFFSET ebp, 0
- pushl_cfi %edi
- CFI_REL_OFFSET edi, 0
- pushl_cfi %esi
- CFI_REL_OFFSET esi, 0
- pushl_cfi %edx
- CFI_REL_OFFSET edx, 0
- pushl_cfi %ecx
- CFI_REL_OFFSET ecx, 0
- pushl_cfi %ebx
- CFI_REL_OFFSET ebx, 0
- movl $(__USER_DS), %edx
- movl %edx, %ds
- movl %edx, %es
- movl $(__KERNEL_PERCPU), %edx
- movl %edx, %fs
+ pushl %fs
+ pushl %es
+ pushl %ds
+ pushl %eax
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ pushl %ecx
+ pushl %ebx
+ movl $(__USER_DS), %edx
+ movl %edx, %ds
+ movl %edx, %es
+ movl $(__KERNEL_PERCPU), %edx
+ movl %edx, %fs
SET_KERNEL_GS %edx
.endm
.macro RESTORE_INT_REGS
- popl_cfi %ebx
- CFI_RESTORE ebx
- popl_cfi %ecx
- CFI_RESTORE ecx
- popl_cfi %edx
- CFI_RESTORE edx
- popl_cfi %esi
- CFI_RESTORE esi
- popl_cfi %edi
- CFI_RESTORE edi
- popl_cfi %ebp
- CFI_RESTORE ebp
- popl_cfi %eax
- CFI_RESTORE eax
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esi
+ popl %edi
+ popl %ebp
+ popl %eax
.endm
.macro RESTORE_REGS pop=0
RESTORE_INT_REGS
-1: popl_cfi %ds
- /*CFI_RESTORE ds;*/
-2: popl_cfi %es
- /*CFI_RESTORE es;*/
-3: popl_cfi %fs
- /*CFI_RESTORE fs;*/
+1: popl %ds
+2: popl %es
+3: popl %fs
POP_GS \pop
.pushsection .fixup, "ax"
-4: movl $0, (%esp)
- jmp 1b
-5: movl $0, (%esp)
- jmp 2b
-6: movl $0, (%esp)
- jmp 3b
+4: movl $0, (%esp)
+ jmp 1b
+5: movl $0, (%esp)
+ jmp 2b
+6: movl $0, (%esp)
+ jmp 3b
.popsection
- _ASM_EXTABLE(1b,4b)
- _ASM_EXTABLE(2b,5b)
- _ASM_EXTABLE(3b,6b)
+ _ASM_EXTABLE(1b, 4b)
+ _ASM_EXTABLE(2b, 5b)
+ _ASM_EXTABLE(3b, 6b)
POP_GS_EX
.endm
-.macro RING0_INT_FRAME
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA esp, 3*4
- /*CFI_OFFSET cs, -2*4;*/
- CFI_OFFSET eip, -3*4
-.endm
-
-.macro RING0_EC_FRAME
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA esp, 4*4
- /*CFI_OFFSET cs, -2*4;*/
- CFI_OFFSET eip, -3*4
-.endm
-
-.macro RING0_PTREGS_FRAME
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA esp, PT_OLDESP-PT_EBX
- /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/
- CFI_OFFSET eip, PT_EIP-PT_OLDESP
- /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/
- /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/
- CFI_OFFSET eax, PT_EAX-PT_OLDESP
- CFI_OFFSET ebp, PT_EBP-PT_OLDESP
- CFI_OFFSET edi, PT_EDI-PT_OLDESP
- CFI_OFFSET esi, PT_ESI-PT_OLDESP
- CFI_OFFSET edx, PT_EDX-PT_OLDESP
- CFI_OFFSET ecx, PT_ECX-PT_OLDESP
- CFI_OFFSET ebx, PT_EBX-PT_OLDESP
-.endm
-
ENTRY(ret_from_fork)
- CFI_STARTPROC
- pushl_cfi %eax
- call schedule_tail
+ pushl %eax
+ call schedule_tail
GET_THREAD_INFO(%ebp)
- popl_cfi %eax
- pushl_cfi $0x0202 # Reset kernel eflags
- popfl_cfi
- jmp syscall_exit
- CFI_ENDPROC
+ popl %eax
+ pushl $0x0202 # Reset kernel eflags
+ popfl
+ jmp syscall_exit
END(ret_from_fork)
ENTRY(ret_from_kernel_thread)
- CFI_STARTPROC
- pushl_cfi %eax
- call schedule_tail
+ pushl %eax
+ call schedule_tail
GET_THREAD_INFO(%ebp)
- popl_cfi %eax
- pushl_cfi $0x0202 # Reset kernel eflags
- popfl_cfi
- movl PT_EBP(%esp),%eax
- call *PT_EBX(%esp)
- movl $0,PT_EAX(%esp)
- jmp syscall_exit
- CFI_ENDPROC
+ popl %eax
+ pushl $0x0202 # Reset kernel eflags
+ popfl
+ movl PT_EBP(%esp), %eax
+ call *PT_EBX(%esp)
+ movl $0, PT_EAX(%esp)
+ jmp syscall_exit
ENDPROC(ret_from_kernel_thread)
/*
@@ -327,76 +250,70 @@ ENDPROC(ret_from_kernel_thread)
# userspace resumption stub bypassing syscall exit tracing
ALIGN
- RING0_PTREGS_FRAME
ret_from_exception:
preempt_stop(CLBR_ANY)
ret_from_intr:
GET_THREAD_INFO(%ebp)
#ifdef CONFIG_VM86
- movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
- movb PT_CS(%esp), %al
- andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
+ movb PT_CS(%esp), %al
+ andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
#else
/*
* We can be coming here from child spawned by kernel_thread().
*/
- movl PT_CS(%esp), %eax
- andl $SEGMENT_RPL_MASK, %eax
+ movl PT_CS(%esp), %eax
+ andl $SEGMENT_RPL_MASK, %eax
#endif
- cmpl $USER_RPL, %eax
- jb resume_kernel # not returning to v8086 or userspace
+ cmpl $USER_RPL, %eax
+ jb resume_kernel # not returning to v8086 or userspace
ENTRY(resume_userspace)
LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
- # setting need_resched or sigpending
- # between sampling and the iret
+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
+ # setting need_resched or sigpending
+ # between sampling and the iret
TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
- # int/exception return?
- jne work_pending
- jmp restore_all
+ movl TI_flags(%ebp), %ecx
+ andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
+ # int/exception return?
+ jne work_pending
+ jmp restore_all
END(ret_from_exception)
#ifdef CONFIG_PREEMPT
ENTRY(resume_kernel)
DISABLE_INTERRUPTS(CLBR_ANY)
need_resched:
- cmpl $0,PER_CPU_VAR(__preempt_count)
- jnz restore_all
- testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ?
- jz restore_all
- call preempt_schedule_irq
- jmp need_resched
+ cmpl $0, PER_CPU_VAR(__preempt_count)
+ jnz restore_all
+ testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
+ jz restore_all
+ call preempt_schedule_irq
+ jmp need_resched
END(resume_kernel)
#endif
- CFI_ENDPROC
-/* SYSENTER_RETURN points to after the "sysenter" instruction in
- the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
+/*
+ * SYSENTER_RETURN points to after the SYSENTER instruction
+ * in the vsyscall page. See vsyscall-sysentry.S, which defines
+ * the symbol.
+ */
- # sysenter call handler stub
-ENTRY(ia32_sysenter_target)
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA esp, 0
- CFI_REGISTER esp, ebp
- movl SYSENTER_stack_sp0(%esp),%esp
+ # SYSENTER call handler stub
+ENTRY(entry_SYSENTER_32)
+ movl SYSENTER_stack_sp0(%esp), %esp
sysenter_past_esp:
/*
* Interrupts are disabled here, but we can't trace it until
* enough kernel state to call TRACE_IRQS_OFF can be called - but
* we immediately enable interrupts at that point anyway.
*/
- pushl_cfi $__USER_DS
- /*CFI_REL_OFFSET ss, 0*/
- pushl_cfi %ebp
- CFI_REL_OFFSET esp, 0
- pushfl_cfi
- orl $X86_EFLAGS_IF, (%esp)
- pushl_cfi $__USER_CS
- /*CFI_REL_OFFSET cs, 0*/
+ pushl $__USER_DS
+ pushl %ebp
+ pushfl
+ orl $X86_EFLAGS_IF, (%esp)
+ pushl $__USER_CS
/*
* Push current_thread_info()->sysenter_return to the stack.
* A tiny bit of offset fixup is necessary: TI_sysenter_return
@@ -405,10 +322,9 @@ sysenter_past_esp:
* TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack;
* and THREAD_SIZE takes us to the bottom.
*/
- pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp)
- CFI_REL_OFFSET eip, 0
+ pushl ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp)
- pushl_cfi %eax
+ pushl %eax
SAVE_ALL
ENABLE_INTERRUPTS(CLBR_NONE)
@@ -416,174 +332,167 @@ sysenter_past_esp:
* Load the potential sixth argument from user stack.
* Careful about security.
*/
- cmpl $__PAGE_OFFSET-3,%ebp
- jae syscall_fault
+ cmpl $__PAGE_OFFSET-3, %ebp
+ jae syscall_fault
ASM_STAC
-1: movl (%ebp),%ebp
+1: movl (%ebp), %ebp
ASM_CLAC
- movl %ebp,PT_EBP(%esp)
- _ASM_EXTABLE(1b,syscall_fault)
+ movl %ebp, PT_EBP(%esp)
+ _ASM_EXTABLE(1b, syscall_fault)
GET_THREAD_INFO(%ebp)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
- jnz sysenter_audit
+ testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp)
+ jnz sysenter_audit
sysenter_do_call:
- cmpl $(NR_syscalls), %eax
- jae sysenter_badsys
- call *sys_call_table(,%eax,4)
+ cmpl $(NR_syscalls), %eax
+ jae sysenter_badsys
+ call *sys_call_table(, %eax, 4)
sysenter_after_call:
- movl %eax,PT_EAX(%esp)
+ movl %eax, PT_EAX(%esp)
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- testl $_TIF_ALLWORK_MASK, %ecx
- jnz sysexit_audit
+ movl TI_flags(%ebp), %ecx
+ testl $_TIF_ALLWORK_MASK, %ecx
+ jnz sysexit_audit
sysenter_exit:
/* if something modifies registers it must also disable sysexit */
- movl PT_EIP(%esp), %edx
- movl PT_OLDESP(%esp), %ecx
- xorl %ebp,%ebp
+ movl PT_EIP(%esp), %edx
+ movl PT_OLDESP(%esp), %ecx
+ xorl %ebp, %ebp
TRACE_IRQS_ON
-1: mov PT_FS(%esp), %fs
+1: mov PT_FS(%esp), %fs
PTGS_TO_GS
ENABLE_INTERRUPTS_SYSEXIT
#ifdef CONFIG_AUDITSYSCALL
sysenter_audit:
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
- jnz syscall_trace_entry
- /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */
- movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */
- /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */
- pushl_cfi PT_ESI(%esp) /* a3: 5th arg */
- pushl_cfi PT_EDX+4(%esp) /* a2: 4th arg */
- call __audit_syscall_entry
- popl_cfi %ecx /* get that remapped edx off the stack */
- popl_cfi %ecx /* get that remapped esi off the stack */
- movl PT_EAX(%esp),%eax /* reload syscall number */
- jmp sysenter_do_call
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), TI_flags(%ebp)
+ jnz syscall_trace_entry
+ /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */
+ movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */
+	/* movl PT_ECX(%esp), %ecx already set, a1: 3rd arg to audit */
+ pushl PT_ESI(%esp) /* a3: 5th arg */
+ pushl PT_EDX+4(%esp) /* a2: 4th arg */
+ call __audit_syscall_entry
+ popl %ecx /* get that remapped edx off the stack */
+ popl %ecx /* get that remapped esi off the stack */
+ movl PT_EAX(%esp), %eax /* reload syscall number */
+ jmp sysenter_do_call
sysexit_audit:
- testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
- jnz syscall_exit_work
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
+ jnz syscall_exit_work
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY)
- movl %eax,%edx /* second arg, syscall return value */
- cmpl $-MAX_ERRNO,%eax /* is it an error ? */
- setbe %al /* 1 if so, 0 if not */
- movzbl %al,%eax /* zero-extend that */
- call __audit_syscall_exit
+ movl %eax, %edx /* second arg, syscall return value */
+	cmpl	$-MAX_ERRNO, %eax	/* is it an error? */
+ setbe %al /* 1 if so, 0 if not */
+ movzbl %al, %eax /* zero-extend that */
+ call __audit_syscall_exit
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
- jnz syscall_exit_work
- movl PT_EAX(%esp),%eax /* reload syscall return value */
- jmp sysenter_exit
+ movl TI_flags(%ebp), %ecx
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
+ jnz syscall_exit_work
+ movl PT_EAX(%esp), %eax /* reload syscall return value */
+ jmp sysenter_exit
#endif
- CFI_ENDPROC
-.pushsection .fixup,"ax"
-2: movl $0,PT_FS(%esp)
- jmp 1b
+.pushsection .fixup, "ax"
+2: movl $0, PT_FS(%esp)
+ jmp 1b
.popsection
- _ASM_EXTABLE(1b,2b)
+ _ASM_EXTABLE(1b, 2b)
PTGS_TO_GS_EX
-ENDPROC(ia32_sysenter_target)
+ENDPROC(entry_SYSENTER_32)
# pv sysenter call handler stub
-ENTRY(ia32pv_sysenter_target)
- RING0_INT_FRAME
+ENTRY(entry_SYSENTER_PV32)
ASM_CLAC
- movl $__USER_DS,16(%esp)
- movl %ebp,12(%esp)
- movl $__USER_CS,4(%esp)
- addl $4,%esp
- CFI_ADJUST_CFA_OFFSET -4
+ movl $__USER_DS, 16(%esp)
+ movl %ebp, 12(%esp)
+ movl $__USER_CS, 4(%esp)
+ addl $4, %esp
/* +5*4 is SS:ESP,EFLAGS,CS:EIP. +8 is esp0 setting. */
- pushl_cfi (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
+ pushl TI_sysenter_return-THREAD_SIZE+8+4*4(%esp)
/*
* Load the potential sixth argument from user stack.
* Careful about security.
*/
- cmpl $__PAGE_OFFSET-3,%ebp
- jae syscall_fault
+ cmpl $__PAGE_OFFSET-3, %ebp
+ jae syscall_fault
ASM_STAC
-1: movl (%ebp),%ebp
+1: movl (%ebp), %ebp
ASM_CLAC
_ASM_EXTABLE(1b,syscall_fault)
- jmp system_call
- CFI_ENDPROC
-ENDPROC(ia32pv_sysenter_target)
+ jmp entry_INT80_32
+ENDPROC(entry_SYSENTER_PV32)
# system call handler stub
-ENTRY(system_call)
- RING0_INT_FRAME # can't unwind into user space anyway
+ENTRY(entry_INT80_32)
ASM_CLAC
- pushl_cfi %eax # save orig_eax
+ pushl %eax # save orig_eax
SAVE_ALL
GET_THREAD_INFO(%ebp)
- # system call tracing in operation / emulation
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
- jnz syscall_trace_entry
- cmpl $(NR_syscalls), %eax
- jae syscall_badsys
+ # system call tracing in operation / emulation
+ testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp)
+ jnz syscall_trace_entry
+ cmpl $(NR_syscalls), %eax
+ jae syscall_badsys
syscall_call:
- call *sys_call_table(,%eax,4)
+ call *sys_call_table(, %eax, 4)
syscall_after_call:
- movl %eax,PT_EAX(%esp) # store the return value
+ movl %eax, PT_EAX(%esp) # store the return value
syscall_exit:
LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
- # setting need_resched or sigpending
- # between sampling and the iret
+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
+ # setting need_resched or sigpending
+ # between sampling and the iret
TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- testl $_TIF_ALLWORK_MASK, %ecx # current->work
- jnz syscall_exit_work
+ movl TI_flags(%ebp), %ecx
+ testl $_TIF_ALLWORK_MASK, %ecx # current->work
+ jnz syscall_exit_work
restore_all:
TRACE_IRQS_IRET
restore_all_notrace:
#ifdef CONFIG_X86_ESPFIX32
- movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
- # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
- # are returning to the kernel.
- # See comments in process.c:copy_thread() for details.
- movb PT_OLDSS(%esp), %ah
- movb PT_CS(%esp), %al
- andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
- cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
- CFI_REMEMBER_STATE
- je ldt_ss # returning to user-space with LDT SS
+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
+ /*
+ * Warning: PT_OLDSS(%esp) contains the wrong/random values if we
+ * are returning to the kernel.
+ * See comments in process.c:copy_thread() for details.
+ */
+ movb PT_OLDSS(%esp), %ah
+ movb PT_CS(%esp), %al
+ andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
+ cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
+ je ldt_ss # returning to user-space with LDT SS
#endif
restore_nocheck:
#ifdef CONFIG_XEN
- movl PT_EFLAGS(%esp), %eax
- testl $(X86_EFLAGS_VM|NMI_MASK), %eax
- CFI_REMEMBER_STATE
- jnz hypervisor_iret
- shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
+ movl PT_EFLAGS(%esp), %eax
+ testl $(X86_EFLAGS_VM|NMI_MASK), %eax
+ jnz hypervisor_iret
+ shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
GET_VCPU_INFO
- andb evtchn_upcall_mask(%esi),%al
- andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask
- CFI_REMEMBER_STATE
- jnz restore_all_enable_events # != 0 => enable event delivery
+	andb	evtchn_upcall_mask(%esi), %al
+ andb $1, %al # EAX[0] == IRET_EFLAGS.IF & event_mask
+ jnz restore_all_enable_events # != 0 => enable event delivery
#endif
- RESTORE_REGS 4 # skip orig_eax/error_code
+ RESTORE_REGS 4 # skip orig_eax/error_code
irq_return:
INTERRUPT_RETURN
-.section .fixup,"ax"
-ENTRY(iret_exc)
- pushl $0 # no error code
- pushl $do_iret_error
- jmp error_code
+.section .fixup, "ax"
+ENTRY(iret_exc)
+ pushl $0 # no error code
+ pushl $do_iret_error
+ jmp error_code
.previous
- _ASM_EXTABLE(irq_return,iret_exc)
+ _ASM_EXTABLE(irq_return, iret_exc)
- CFI_RESTORE_STATE
#ifdef CONFIG_X86_ESPFIX32
ldt_ss:
#ifdef CONFIG_PARAVIRT
@@ -595,8 +504,8 @@ ldt_ss:
* is still available to implement the setting of the high
* 16-bits in the INTERRUPT_RETURN paravirt-op.
*/
- cmpl $0, pv_info+PARAVIRT_enabled
- jne restore_nocheck
+ cmpl $0, pv_info+PARAVIRT_enabled
+ jne restore_nocheck
#endif
/*
@@ -611,22 +520,23 @@ ldt_ss:
* a base address that matches for the difference.
*/
#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
- mov %esp, %edx /* load kernel esp */
- mov PT_OLDESP(%esp), %eax /* load userspace esp */
- mov %dx, %ax /* eax: new kernel esp */
- sub %eax, %edx /* offset (low word is 0) */
- shr $16, %edx
- mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
- mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
- pushl_cfi $__ESPFIX_SS
- pushl_cfi %eax /* new kernel esp */
- /* Disable interrupts, but do not irqtrace this section: we
+ mov %esp, %edx /* load kernel esp */
+ mov PT_OLDESP(%esp), %eax /* load userspace esp */
+ mov %dx, %ax /* eax: new kernel esp */
+ sub %eax, %edx /* offset (low word is 0) */
+ shr $16, %edx
+ mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
+ mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
+ pushl $__ESPFIX_SS
+ pushl %eax /* new kernel esp */
+ /*
+ * Disable interrupts, but do not irqtrace this section: we
* will soon execute iret and the tracer was already set to
- * the irqstate after the iret */
+ * the irqstate after the IRET:
+ */
DISABLE_INTERRUPTS(CLBR_EAX)
- lss (%esp), %esp /* switch to espfix segment */
- CFI_ADJUST_CFA_OFFSET -8
- jmp restore_nocheck
+ lss (%esp), %esp /* switch to espfix segment */
+ jmp restore_nocheck
#endif
#ifdef CONFIG_XEN
ALIGN
@@ -635,120 +545,114 @@ restore_all_enable_events:
__ENABLE_INTERRUPTS
.Lscrit: /**** START OF CRITICAL REGION ****/
__TEST_PENDING
- jnz 14f # process more events if necessary...
+ jnz 14f # process more events if necessary...
RESTORE_REGS 4
1: INTERRUPT_RETURN
_ASM_EXTABLE(1b,iret_exc)
14: __DISABLE_INTERRUPTS
.Lecrit: /**** END OF CRITICAL REGION ****/
TRACE_IRQS_OFF
- jmp .Ldo_upcall
+ jmp .Ldo_upcall
- CFI_RESTORE_STATE
hypervisor_iret:
- andl $~NMI_MASK, PT_EFLAGS(%esp)
+ andl $~NMI_MASK, PT_EFLAGS(%esp)
RESTORE_REGS 4
- jmp hypercall_page + (__HYPERVISOR_iret * 32)
+ jmp hypercall_page + (__HYPERVISOR_iret * 32)
#endif
- CFI_ENDPROC
-ENDPROC(system_call)
+ENDPROC(entry_INT80_32)
# perform work that needs to be done immediately before resumption
ALIGN
- RING0_PTREGS_FRAME # can't unwind into user space anyway
work_pending:
- testb $_TIF_NEED_RESCHED, %cl
- jz work_notifysig
+ testb $_TIF_NEED_RESCHED, %cl
+ jz work_notifysig
work_resched:
- call schedule
+ call schedule
LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
- # setting need_resched or sigpending
- # between sampling and the iret
+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
+ # setting need_resched or sigpending
+ # between sampling and the iret
TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
- # than syscall tracing?
- jz restore_all
- testb $_TIF_NEED_RESCHED, %cl
- jnz work_resched
+ movl TI_flags(%ebp), %ecx
+ andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
+ # than syscall tracing?
+ jz restore_all
+ testb $_TIF_NEED_RESCHED, %cl
+ jnz work_resched
-work_notifysig: # deal with pending signals and
- # notify-resume requests
+work_notifysig: # deal with pending signals and
+ # notify-resume requests
#ifdef CONFIG_VM86
- testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
- movl %esp, %eax
- jnz work_notifysig_v86 # returning to kernel-space or
- # vm86-space
+ testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
+ movl %esp, %eax
+ jnz work_notifysig_v86 # returning to kernel-space or
+ # vm86-space
1:
#else
- movl %esp, %eax
+ movl %esp, %eax
#endif
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- movb PT_CS(%esp), %bl
- andb $SEGMENT_RPL_MASK, %bl
- cmpb $USER_RPL, %bl
- jb resume_kernel
- xorl %edx, %edx
- call do_notify_resume
- jmp resume_userspace
+ movb PT_CS(%esp), %bl
+ andb $SEGMENT_RPL_MASK, %bl
+ cmpb $USER_RPL, %bl
+ jb resume_kernel
+ xorl %edx, %edx
+ call do_notify_resume
+ jmp resume_userspace
#ifdef CONFIG_VM86
ALIGN
work_notifysig_v86:
- pushl_cfi %ecx # save ti_flags for do_notify_resume
- call save_v86_state # %eax contains pt_regs pointer
- popl_cfi %ecx
- movl %eax, %esp
- jmp 1b
+ pushl %ecx # save ti_flags for do_notify_resume
+ call save_v86_state # %eax contains pt_regs pointer
+ popl %ecx
+ movl %eax, %esp
+ jmp 1b
#endif
END(work_pending)
# perform syscall exit tracing
ALIGN
syscall_trace_entry:
- movl $-ENOSYS,PT_EAX(%esp)
- movl %esp, %eax
- call syscall_trace_enter
+ movl $-ENOSYS, PT_EAX(%esp)
+ movl %esp, %eax
+ call syscall_trace_enter
/* What it returned is what we'll actually use. */
- cmpl $(NR_syscalls), %eax
- jnae syscall_call
- jmp syscall_exit
+ cmpl $(NR_syscalls), %eax
+ jnae syscall_call
+ jmp syscall_exit
END(syscall_trace_entry)
# perform syscall exit tracing
ALIGN
syscall_exit_work:
- testl $_TIF_WORK_SYSCALL_EXIT, %ecx
- jz work_pending
+ testl $_TIF_WORK_SYSCALL_EXIT, %ecx
+ jz work_pending
TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
- # schedule() instead
- movl %esp, %eax
- call syscall_trace_leave
- jmp resume_userspace
+ ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
+ # schedule() instead
+ movl %esp, %eax
+ call syscall_trace_leave
+ jmp resume_userspace
END(syscall_exit_work)
- CFI_ENDPROC
- RING0_INT_FRAME # can't unwind into user space anyway
syscall_fault:
ASM_CLAC
GET_THREAD_INFO(%ebp)
- movl $-EFAULT,PT_EAX(%esp)
- jmp resume_userspace
+ movl $-EFAULT, PT_EAX(%esp)
+ jmp resume_userspace
END(syscall_fault)
syscall_badsys:
- movl $-ENOSYS,%eax
- jmp syscall_after_call
+ movl $-ENOSYS, %eax
+ jmp syscall_after_call
END(syscall_badsys)
sysenter_badsys:
- movl $-ENOSYS,%eax
- jmp sysenter_after_call
+ movl $-ENOSYS, %eax
+ jmp sysenter_after_call
END(sysenter_badsys)
- CFI_ENDPROC
#ifndef CONFIG_XEN
.macro FIXUP_ESPFIX_STACK
@@ -761,25 +665,24 @@ END(sysenter_badsys)
*/
#ifdef CONFIG_X86_ESPFIX32
/* fixup the stack */
- mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
- mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
- shl $16, %eax
- addl %esp, %eax /* the adjusted stack pointer */
- pushl_cfi $__KERNEL_DS
- pushl_cfi %eax
- lss (%esp), %esp /* switch to the normal stack segment */
- CFI_ADJUST_CFA_OFFSET -8
+ mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
+ mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
+ shl $16, %eax
+ addl %esp, %eax /* the adjusted stack pointer */
+ pushl $__KERNEL_DS
+ pushl %eax
+ lss (%esp), %esp /* switch to the normal stack segment */
#endif
.endm
.macro UNWIND_ESPFIX_STACK
#ifdef CONFIG_X86_ESPFIX32
- movl %ss, %eax
+ movl %ss, %eax
/* see if on espfix stack */
- cmpw $__ESPFIX_SS, %ax
- jne 27f
- movl $__KERNEL_DS, %eax
- movl %eax, %ds
- movl %eax, %es
+ cmpw $__ESPFIX_SS, %ax
+ jne 27f
+ movl $__KERNEL_DS, %eax
+ movl %eax, %ds
+ movl %eax, %es
/* switch to normal stack */
FIXUP_ESPFIX_STACK
27:
@@ -792,13 +695,11 @@ END(sysenter_badsys)
*/
.align 8
ENTRY(irq_entries_start)
- RING0_INT_FRAME
vector=FIRST_EXTERNAL_VECTOR
.rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
- pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */
+ pushl $(~vector+0x80) /* Note: always in signed byte range */
vector=vector+1
jmp common_interrupt
- CFI_ADJUST_CFA_OFFSET -4
.align 8
.endr
END(irq_entries_start)
@@ -810,38 +711,34 @@ END(irq_entries_start)
.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
ASM_CLAC
- addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */
+ addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */
SAVE_ALL
TRACE_IRQS_OFF
- movl %esp,%eax
- call do_IRQ
- jmp ret_from_intr
+ movl %esp, %eax
+ call do_IRQ
+ jmp ret_from_intr
ENDPROC(common_interrupt)
- CFI_ENDPROC
#define BUILD_INTERRUPT3(name, nr, fn) \
ENTRY(name) \
- RING0_INT_FRAME; \
ASM_CLAC; \
- pushl_cfi $~(nr); \
+ pushl $~(nr); \
SAVE_ALL; \
TRACE_IRQS_OFF \
- movl %esp,%eax; \
- call fn; \
- jmp ret_from_intr; \
- CFI_ENDPROC; \
+ movl %esp, %eax; \
+ call fn; \
+ jmp ret_from_intr; \
ENDPROC(name)
#ifdef CONFIG_TRACING
-#define TRACE_BUILD_INTERRUPT(name, nr) \
- BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name)
+# define TRACE_BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name)
#else
-#define TRACE_BUILD_INTERRUPT(name, nr)
+# define TRACE_BUILD_INTERRUPT(name, nr)
#endif
-#define BUILD_INTERRUPT(name, nr) \
- BUILD_INTERRUPT3(name, nr, smp_##name); \
+#define BUILD_INTERRUPT(name, nr) \
+ BUILD_INTERRUPT3(name, nr, smp_##name); \
TRACE_BUILD_INTERRUPT(name, nr)
/* The include is where all of the SMP etc. interrupts come from */
@@ -867,34 +764,31 @@ ENDPROC(name)
# critical region we know that the entire frame is present and correct
# so we can simply throw away the new one.
ENTRY(hypervisor_callback)
- RING0_INT_FRAME
ASM_CLAC
- pushl_cfi $-1
+ pushl $-1
SAVE_ALL
- movl PT_CS(%esp),%ecx
- movl PT_EIP(%esp),%eax
- andl $SEGMENT_RPL_MASK,%ecx
- cmpl $USER_RPL,%ecx
- jae .Ldo_upcall
- cmpl $.Lscrit,%eax
- jb 0f
- cmpl $.Lecrit,%eax
- jb critical_region_fixup
+ movl PT_CS(%esp), %ecx
+ movl PT_EIP(%esp), %eax
+ andl $SEGMENT_RPL_MASK, %ecx
+ cmpl $USER_RPL, %ecx
+ jae .Ldo_upcall
+ cmpl $.Lscrit, %eax
+ jb 0f
+ cmpl $.Lecrit, %eax
+ jb critical_region_fixup
0:
#ifdef CONFIG_XEN_SUPERVISOR_MODE_KERNEL
- cmpl $.Lsysexit_scrit,%eax
- jb .Ldo_upcall
- cmpl $.Lsysexit_ecrit,%eax
- ja .Ldo_upcall
- addl $PT_OLDESP,%esp # Remove eflags...ebx from stack frame.
+ cmpl $.Lsysexit_scrit, %eax
+ jb .Ldo_upcall
+ cmpl $.Lsysexit_ecrit, %eax
+ ja .Ldo_upcall
+ addl $PT_OLDESP, %esp # Remove eflags...ebx from stack frame.
#endif
.Ldo_upcall:
- pushl_cfi %esp
- call evtchn_do_upcall
- add $4,%esp
- CFI_ADJUST_CFA_OFFSET -4
- jmp ret_from_intr
- CFI_ENDPROC
+ pushl %esp
+ call evtchn_do_upcall
+ add $4, %esp
+ jmp ret_from_intr
# [How we do the fixup]. We want to merge the current stack frame with the
# just-interrupted frame. How we do this depends on where in the critical
@@ -904,18 +798,18 @@ ENTRY(hypervisor_callback)
# provides the number of bytes which have already been popped from the
# interrupted stack frame.
critical_region_fixup:
- movsbl critical_fixup_table-.Lscrit(%eax),%ecx # %ecx contains num slots popped
- testl %ecx,%ecx
- leal (%esp,%ecx,4),%esi # %esi points at end of src region
- leal PT_OLDESP(%esp),%edi # %edi points at end of dst region
- jle 17f # skip loop if nothing to copy
-16: subl $4,%esi # pre-decrementing copy loop
- subl $4,%edi
- movl (%esi),%eax
- movl %eax,(%edi)
- loop 16b
-17: movl %edi,%esp # final %edi is top of merged stack
- jmp .Ldo_upcall
+	movsbl	critical_fixup_table-.Lscrit(%eax), %ecx	# %ecx contains num slots popped
+ testl %ecx, %ecx
+ leal (%esp,%ecx,4), %esi # %esi points at end of src region
+ leal PT_OLDESP(%esp), %edi # %edi points at end of dst region
+ jle 17f # skip loop if nothing to copy
+16: subl $4, %esi # pre-decrementing copy loop
+ subl $4, %edi
+ movl (%esi), %eax
+ movl %eax, (%edi)
+ loop 16b
+17: movl %edi, %esp # final %edi is top of merged stack
+ jmp .Ldo_upcall
.section .rodata,"a"
critical_fixup_table:
@@ -953,73 +847,65 @@ critical_fixup_table:
# We distinguish between categories by maintaining a status value in EAX.
ENTRY(failsafe_callback)
ASM_CLAC
- pushl %eax
- movl $1,%eax
-1: mov 4(%esp),%ds
-2: mov 8(%esp),%es
-3: mov 12(%esp),%fs
-4: mov 16(%esp),%gs
- testl %eax,%eax
- popl %eax
- leal 16(%esp),%esp
- RING0_INT_FRAME
- jnz iret_exc # EAX != 0 => Category 2 (Bad IRET)
- pushl_cfi $-1 # EAX == 0 => Category 1 (Bad segment)
+ pushl %eax
+ movl $1, %eax
+1: mov 4(%esp), %ds
+2: mov 8(%esp), %es
+3: mov 12(%esp), %fs
+4: mov 16(%esp), %gs
+ testl %eax, %eax
+ popl %eax
+ leal 16(%esp), %esp
+ jnz iret_exc # EAX != 0 => Category 2 (Bad IRET)
+ pushl $-1 # EAX == 0 => Category 1 (Bad segment)
SAVE_ALL
- jmp ret_from_exception
-.section .fixup,"ax"; \
-6: xorl %eax,%eax; \
- movl %eax,4(%esp); \
- jmp 1b; \
-7: xorl %eax,%eax; \
- movl %eax,8(%esp); \
- jmp 2b; \
-8: xorl %eax,%eax; \
- movl %eax,12(%esp); \
- jmp 3b; \
-9: xorl %eax,%eax; \
- movl %eax,16(%esp); \
- jmp 4b; \
+ jmp ret_from_exception
+.section .fixup,"ax"
+6: xorl %eax, %eax
+ movl %eax, 4(%esp)
+ jmp 1b
+7: xorl %eax, %eax
+ movl %eax, 8(%esp)
+ jmp 2b
+8: xorl %eax, %eax
+ movl %eax, 12(%esp)
+ jmp 3b
+9: xorl %eax, %eax
+ movl %eax, 16(%esp)
+ jmp 4b
.previous
_ASM_EXTABLE(1b,6b)
_ASM_EXTABLE(2b,7b)
_ASM_EXTABLE(3b,8b)
_ASM_EXTABLE(4b,9b)
#endif
- CFI_ENDPROC
ENTRY(coprocessor_error)
- RING0_INT_FRAME
ASM_CLAC
- pushl_cfi $0
- pushl_cfi $do_coprocessor_error
- jmp error_code
- CFI_ENDPROC
+ pushl $0
+ pushl $do_coprocessor_error
+ jmp error_code
END(coprocessor_error)
ENTRY(simd_coprocessor_error)
- RING0_INT_FRAME
ASM_CLAC
- pushl_cfi $0
+ pushl $0
#ifdef CONFIG_X86_INVD_BUG
/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
- ALTERNATIVE "pushl_cfi $do_general_protection", \
- "pushl $do_simd_coprocessor_error", \
+ ALTERNATIVE "pushl $do_general_protection", \
+ "pushl $do_simd_coprocessor_error", \
X86_FEATURE_XMM
#else
- pushl_cfi $do_simd_coprocessor_error
+ pushl $do_simd_coprocessor_error
#endif
- jmp error_code
- CFI_ENDPROC
+ jmp error_code
END(simd_coprocessor_error)
ENTRY(device_not_available)
- RING0_INT_FRAME
ASM_CLAC
- pushl_cfi $-1 # mark this as an int
- pushl_cfi $do_device_not_available
- jmp error_code
- CFI_ENDPROC
+ pushl $-1 # mark this as an int
+ pushl $do_device_not_available
+ jmp error_code
END(device_not_available)
#ifdef CONFIG_PARAVIRT
@@ -1035,109 +921,85 @@ END(native_irq_enable_sysexit)
#endif
ENTRY(overflow)
- RING0_INT_FRAME
ASM_CLAC
- pushl_cfi $0
- pushl_cfi $do_overflow
- jmp error_code
- CFI_ENDPROC
+ pushl $0
+ pushl $do_overflow
+ jmp error_code
END(overflow)
ENTRY(bounds)
- RING0_INT_FRAME
ASM_CLAC
- pushl_cfi $0
- pushl_cfi $do_bounds
- jmp error_code
- CFI_ENDPROC
+ pushl $0
+ pushl $do_bounds
+ jmp error_code
END(bounds)
ENTRY(invalid_op)
- RING0_INT_FRAME
ASM_CLAC
- pushl_cfi $0
- pushl_cfi $do_invalid_op
- jmp error_code
- CFI_ENDPROC
+ pushl $0
+ pushl $do_invalid_op
+ jmp error_code
END(invalid_op)
ENTRY(coprocessor_segment_overrun)
- RING0_INT_FRAME
ASM_CLAC
- pushl_cfi $0
- pushl_cfi $do_coprocessor_segment_overrun
- jmp error_code
- CFI_ENDPROC
+ pushl $0
+ pushl $do_coprocessor_segment_overrun
+ jmp error_code
END(coprocessor_segment_overrun)
ENTRY(invalid_TSS)
- RING0_EC_FRAME
ASM_CLAC
- pushl_cfi $do_invalid_TSS
- jmp error_code
- CFI_ENDPROC
+ pushl $do_invalid_TSS
+ jmp error_code
END(invalid_TSS)
ENTRY(segment_not_present)
- RING0_EC_FRAME
ASM_CLAC
- pushl_cfi $do_segment_not_present
- jmp error_code
- CFI_ENDPROC
+ pushl $do_segment_not_present
+ jmp error_code
END(segment_not_present)
ENTRY(stack_segment)
- RING0_EC_FRAME
ASM_CLAC
- pushl_cfi $do_stack_segment
- jmp error_code
- CFI_ENDPROC
+ pushl $do_stack_segment
+ jmp error_code
END(stack_segment)
ENTRY(alignment_check)
- RING0_EC_FRAME
ASM_CLAC
- pushl_cfi $do_alignment_check
- jmp error_code
- CFI_ENDPROC
+ pushl $do_alignment_check
+ jmp error_code
END(alignment_check)
ENTRY(divide_error)
- RING0_INT_FRAME
ASM_CLAC
- pushl_cfi $0 # no error code
- pushl_cfi $do_divide_error
- jmp error_code
- CFI_ENDPROC
+ pushl $0 # no error code
+ pushl $do_divide_error
+ jmp error_code
END(divide_error)
#ifdef CONFIG_X86_MCE
ENTRY(machine_check)
- RING0_INT_FRAME
ASM_CLAC
- pushl_cfi $0
- pushl_cfi machine_check_vector
- jmp error_code
- CFI_ENDPROC
+ pushl $0
+ pushl machine_check_vector
+ jmp error_code
END(machine_check)
#endif
#ifndef CONFIG_XEN
ENTRY(spurious_interrupt_bug)
- RING0_INT_FRAME
ASM_CLAC
- pushl_cfi $0
- pushl_cfi $do_spurious_interrupt_bug
- jmp error_code
- CFI_ENDPROC
+ pushl $0
+ pushl $do_spurious_interrupt_bug
+ jmp error_code
END(spurious_interrupt_bug)
#endif /* !CONFIG_XEN */
ENTRY(fixup_4gb_segment)
- RING0_EC_FRAME
- pushl_cfi $do_fixup_4gb_segment
+ pushl $do_fixup_4gb_segment
jmp error_code
- CFI_ENDPROC
END(fixup_4gb_segment)
#ifdef CONFIG_FUNCTION_TRACER
@@ -1148,28 +1010,28 @@ ENTRY(mcount)
END(mcount)
ENTRY(ftrace_caller)
- pushl %eax
- pushl %ecx
- pushl %edx
- pushl $0 /* Pass NULL as regs pointer */
- movl 4*4(%esp), %eax
- movl 0x4(%ebp), %edx
- movl function_trace_op, %ecx
- subl $MCOUNT_INSN_SIZE, %eax
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ pushl $0 /* Pass NULL as regs pointer */
+ movl 4*4(%esp), %eax
+ movl 0x4(%ebp), %edx
+ movl function_trace_op, %ecx
+ subl $MCOUNT_INSN_SIZE, %eax
.globl ftrace_call
ftrace_call:
- call ftrace_stub
+ call ftrace_stub
- addl $4,%esp /* skip NULL pointer */
- popl %edx
- popl %ecx
- popl %eax
+ addl $4, %esp /* skip NULL pointer */
+ popl %edx
+ popl %ecx
+ popl %eax
ftrace_ret:
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
.globl ftrace_graph_call
ftrace_graph_call:
- jmp ftrace_stub
+ jmp ftrace_stub
#endif
.globl ftrace_stub
@@ -1187,72 +1049,72 @@ ENTRY(ftrace_regs_caller)
* as the current return ip is. We move the return ip into the
* ip location, and move flags into the return ip location.
*/
- pushl 4(%esp) /* save return ip into ip slot */
+ pushl 4(%esp) /* save return ip into ip slot */
- pushl $0 /* Load 0 into orig_ax */
- pushl %gs
- pushl %fs
- pushl %es
- pushl %ds
- pushl %eax
- pushl %ebp
- pushl %edi
- pushl %esi
- pushl %edx
- pushl %ecx
- pushl %ebx
-
- movl 13*4(%esp), %eax /* Get the saved flags */
- movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */
- /* clobbering return ip */
- movl $__KERNEL_CS,13*4(%esp)
-
- movl 12*4(%esp), %eax /* Load ip (1st parameter) */
- subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
- movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */
- movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
- pushl %esp /* Save pt_regs as 4th parameter */
+ pushl $0 /* Load 0 into orig_ax */
+ pushl %gs
+ pushl %fs
+ pushl %es
+ pushl %ds
+ pushl %eax
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ pushl %ecx
+ pushl %ebx
+
+ movl 13*4(%esp), %eax /* Get the saved flags */
+ movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */
+ /* clobbering return ip */
+ movl $__KERNEL_CS, 13*4(%esp)
+
+ movl 12*4(%esp), %eax /* Load ip (1st parameter) */
+ subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
+ movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */
+ movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
+ pushl %esp /* Save pt_regs as 4th parameter */
GLOBAL(ftrace_regs_call)
- call ftrace_stub
+ call ftrace_stub
- addl $4, %esp /* Skip pt_regs */
- movl 14*4(%esp), %eax /* Move flags back into cs */
- movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */
- movl 12*4(%esp), %eax /* Get return ip from regs->ip */
- movl %eax, 14*4(%esp) /* Put return ip back for ret */
-
- popl %ebx
- popl %ecx
- popl %edx
- popl %esi
- popl %edi
- popl %ebp
- popl %eax
- popl %ds
- popl %es
- popl %fs
- popl %gs
- addl $8, %esp /* Skip orig_ax and ip */
- popf /* Pop flags at end (no addl to corrupt flags) */
- jmp ftrace_ret
+ addl $4, %esp /* Skip pt_regs */
+ movl 14*4(%esp), %eax /* Move flags back into cs */
+ movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */
+ movl 12*4(%esp), %eax /* Get return ip from regs->ip */
+ movl %eax, 14*4(%esp) /* Put return ip back for ret */
+
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esi
+ popl %edi
+ popl %ebp
+ popl %eax
+ popl %ds
+ popl %es
+ popl %fs
+ popl %gs
+ addl $8, %esp /* Skip orig_ax and ip */
+ popf /* Pop flags at end (no addl to corrupt flags) */
+ jmp ftrace_ret
popf
- jmp ftrace_stub
+ jmp ftrace_stub
#else /* ! CONFIG_DYNAMIC_FTRACE */
ENTRY(mcount)
- cmpl $__PAGE_OFFSET, %esp
- jb ftrace_stub /* Paging not enabled yet? */
+ cmpl $__PAGE_OFFSET, %esp
+ jb ftrace_stub /* Paging not enabled yet? */
- cmpl $ftrace_stub, ftrace_trace_function
- jnz trace
+ cmpl $ftrace_stub, ftrace_trace_function
+ jnz trace
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- cmpl $ftrace_stub, ftrace_graph_return
- jnz ftrace_graph_caller
+ cmpl $ftrace_stub, ftrace_graph_return
+ jnz ftrace_graph_caller
- cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
- jnz ftrace_graph_caller
+ cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
+ jnz ftrace_graph_caller
#endif
.globl ftrace_stub
ftrace_stub:
@@ -1260,150 +1122,144 @@ ftrace_stub:
/* taken from glibc */
trace:
- pushl %eax
- pushl %ecx
- pushl %edx
- movl 0xc(%esp), %eax
- movl 0x4(%ebp), %edx
- subl $MCOUNT_INSN_SIZE, %eax
-
- call *ftrace_trace_function
-
- popl %edx
- popl %ecx
- popl %eax
- jmp ftrace_stub
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 0xc(%esp), %eax
+ movl 0x4(%ebp), %edx
+ subl $MCOUNT_INSN_SIZE, %eax
+
+ call *ftrace_trace_function
+
+ popl %edx
+ popl %ecx
+ popl %eax
+ jmp ftrace_stub
END(mcount)
#endif /* CONFIG_DYNAMIC_FTRACE */
#endif /* CONFIG_FUNCTION_TRACER */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
ENTRY(ftrace_graph_caller)
- pushl %eax
- pushl %ecx
- pushl %edx
- movl 0xc(%esp), %eax
- lea 0x4(%ebp), %edx
- movl (%ebp), %ecx
- subl $MCOUNT_INSN_SIZE, %eax
- call prepare_ftrace_return
- popl %edx
- popl %ecx
- popl %eax
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 0xc(%esp), %eax
+ lea 0x4(%ebp), %edx
+ movl (%ebp), %ecx
+ subl $MCOUNT_INSN_SIZE, %eax
+ call prepare_ftrace_return
+ popl %edx
+ popl %ecx
+ popl %eax
ret
END(ftrace_graph_caller)
.globl return_to_handler
return_to_handler:
- pushl %eax
- pushl %edx
- movl %ebp, %eax
- call ftrace_return_to_handler
- movl %eax, %ecx
- popl %edx
- popl %eax
- jmp *%ecx
+ pushl %eax
+ pushl %edx
+ movl %ebp, %eax
+ call ftrace_return_to_handler
+ movl %eax, %ecx
+ popl %edx
+ popl %eax
+ jmp *%ecx
#endif
#ifdef TIF_CSTAR
# pv syscall call handler stub
-ENTRY(ia32pv_cstar_target)
- RING0_INT_FRAME
+ENTRY(entry_SYSCALL_PV32)
ASM_CLAC
- movl $__USER_DS,16(%esp)
- movl %ebp,%ecx
- movl $__USER_CS,4(%esp)
- movl 12(%esp),%ebp
- pushl_cfi %eax # save orig_eax
+ movl $__USER_DS, 16(%esp)
+ movl %ebp, %ecx
+ movl $__USER_CS, 4(%esp)
+ movl 12(%esp), %ebp
+ pushl %eax # save orig_eax
/*
* Load the potential sixth argument from user stack.
* Careful about security.
*/
- cmpl $__PAGE_OFFSET-4,%ebp
- CFI_REMEMBER_STATE
- ja cstar_fault
+ cmpl $__PAGE_OFFSET-4, %ebp
+ ja cstar_fault
ASM_STAC
-1: movl (%ebp),%ebp
+1: movl (%ebp), %ebp
ASM_CLAC
_ASM_EXTABLE(1b,cstar_fault)
SAVE_ALL
GET_THREAD_INFO(%ebp)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
- jnz cstar_trace_entry
- cmpl $NR_syscalls,%eax
- jae cstar_badsys
+ testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp)
+ jnz cstar_trace_entry
+ cmpl $NR_syscalls, %eax
+ jae cstar_badsys
.Lcstar_call:
- btl %eax,cstar_special
- jc .Lcstar_special
- call *cstar_call_table(,%eax,4)
+ btl %eax, cstar_special
+ jc .Lcstar_special
+	call	*cstar_call_table(, %eax, 4)
.Lcstar_after_call:
- movl %eax,PT_EAX(%esp) # store the return value
+ movl %eax, PT_EAX(%esp) # store the return value
.Lcstar_exit:
- movl PT_ECX(%esp),%ecx
- movl %ecx,PT_EBP(%esp) # put user EBP back in place
- jmp syscall_exit
+ movl PT_ECX(%esp), %ecx
+ movl %ecx, PT_EBP(%esp) # put user EBP back in place
+ jmp syscall_exit
.Lcstar_special:
- movl PT_ECX(%esp),%ecx
- movl %ecx,PT_EBP(%esp) # put user EBP back in place
- jmp syscall_call
+ movl PT_ECX(%esp), %ecx
+	movl	%ecx, PT_EBP(%esp)		# put user EBP back in place
+ jmp syscall_call
GLOBAL(cstar_set_tif)
- movl $cstar_clear_tif,(%esp) # replace return address
+ movl $cstar_clear_tif, (%esp) # replace return address
LOCK_PREFIX
- orl $_TIF_CSTAR,TI_flags(%ebp)
- jmp *sys_call_table(,%eax,4)
+ orl $_TIF_CSTAR, TI_flags(%ebp)
+	jmp	*sys_call_table(, %eax, 4)
cstar_clear_tif:
LOCK_PREFIX
- andl $~_TIF_CSTAR,TI_flags(%ebp)
- jmp .Lcstar_after_call
+ andl $~_TIF_CSTAR, TI_flags(%ebp)
+ jmp .Lcstar_after_call
cstar_trace_entry:
- movl $-ENOSYS,PT_EAX(%esp)
- cmpl $NR_syscalls,%eax
- jae 1f
- btl %eax,cstar_special
- jc .Lcstar_trace_special
-1: movl %esp,%eax
+ movl $-ENOSYS, PT_EAX(%esp)
+ cmpl $NR_syscalls, %eax
+ jae 1f
+ btl %eax, cstar_special
+ jc .Lcstar_trace_special
+1: movl %esp, %eax
LOCK_PREFIX
- orl $_TIF_CSTAR,TI_flags(%ebp)
- call syscall_trace_enter
+ orl $_TIF_CSTAR, TI_flags(%ebp)
+ call syscall_trace_enter
LOCK_PREFIX
- andl $~_TIF_CSTAR,TI_flags(%ebp)
+ andl $~_TIF_CSTAR, TI_flags(%ebp)
/* What it returned is what we'll actually use. */
- cmpl $NR_syscalls,%eax
- jb .Lcstar_call
- jmp .Lcstar_exit
+ cmpl $NR_syscalls, %eax
+ jb .Lcstar_call
+ jmp .Lcstar_exit
.Lcstar_trace_special:
- movl PT_ECX(%esp),%ecx
- movl %esp,%eax
- movl %ecx,PT_EBP(%esp) # put user EBP back in place
- call syscall_trace_enter
+ movl PT_ECX(%esp), %ecx
+ movl %esp, %eax
+ movl %ecx, PT_EBP(%esp) # put user EBP back in place
+ call syscall_trace_enter
/* What it returned is what we'll actually use. */
- cmpl $NR_syscalls,%eax
- jb syscall_call
- jmp syscall_exit
+ cmpl $NR_syscalls, %eax
+ jb syscall_call
+ jmp syscall_exit
cstar_badsys:
- movl $-ENOSYS,%eax
- jmp .Lcstar_after_call
- CFI_RESTORE_STATE
+ movl $-ENOSYS, %eax
+ jmp .Lcstar_after_call
cstar_fault:
ASM_CLAC
- movl $-EFAULT,%eax
+ movl $-EFAULT, %eax
SAVE_ALL
GET_THREAD_INFO(%ebp)
- movl PT_ECX(%esp),%ecx
- movl %ecx,PT_EBP(%esp) # put user EBP back in place
- jmp resume_userspace
- CFI_ENDPROC
-ENDPROC(ia32pv_cstar_target)
+ movl PT_ECX(%esp), %ecx
+ movl %ecx, PT_EBP(%esp) # put user EBP back in place
+ jmp resume_userspace
+ENDPROC(entry_SYSCALL_PV32)
ENTRY(cstar_ret_from_fork)
- CFI_STARTPROC
- movl PT_ECX(%esp),%ecx
+ movl PT_ECX(%esp), %ecx
GET_THREAD_INFO(%ebp)
- movl %ecx,PT_EBP(%esp) # put user EBP back in place
+ movl %ecx, PT_EBP(%esp) # put user EBP back in place
LOCK_PREFIX
- andl $~_TIF_CSTAR,TI_flags(%ebp)
- jmp ret_from_fork
- CFI_ENDPROC
+ andl $~_TIF_CSTAR, TI_flags(%ebp)
+ jmp ret_from_fork
END(cstar_ret_from_fork)
#include <asm/unistd.h>
@@ -1429,52 +1285,45 @@ mask=0
#ifdef CONFIG_TRACING
ENTRY(trace_page_fault)
- RING0_EC_FRAME
ASM_CLAC
- pushl_cfi $trace_do_page_fault
- jmp error_code
- CFI_ENDPROC
+ pushl $trace_do_page_fault
+ jmp error_code
END(trace_page_fault)
#endif
ENTRY(page_fault)
- RING0_EC_FRAME
ASM_CLAC
- pushl_cfi $do_page_fault
+ pushl $do_page_fault
ALIGN
error_code:
/* the function address is in %gs's slot on the stack */
- pushl_cfi %fs
- /*CFI_REL_OFFSET fs, 0*/
- pushl_cfi %es
- /*CFI_REL_OFFSET es, 0*/
- pushl_cfi %ds
- /*CFI_REL_OFFSET ds, 0*/
- pushl_cfi_reg eax
- pushl_cfi_reg ebp
- pushl_cfi_reg edi
- pushl_cfi_reg esi
- pushl_cfi_reg edx
- pushl_cfi_reg ecx
- pushl_cfi_reg ebx
+ pushl %fs
+ pushl %es
+ pushl %ds
+ pushl %eax
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ pushl %ecx
+ pushl %ebx
cld
- movl $(__KERNEL_PERCPU), %ecx
- movl %ecx, %fs
+ movl $(__KERNEL_PERCPU), %ecx
+ movl %ecx, %fs
UNWIND_ESPFIX_STACK
GS_TO_REG %ecx
- movl PT_GS(%esp), %edi # get the function address
- movl PT_ORIG_EAX(%esp), %edx # get the error code
- movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
+ movl PT_GS(%esp), %edi # get the function address
+ movl PT_ORIG_EAX(%esp), %edx # get the error code
+ movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
REG_TO_PTGS %ecx
SET_KERNEL_GS %ecx
- movl $(__USER_DS), %ecx
- movl %ecx, %ds
- movl %ecx, %es
+ movl $(__USER_DS), %ecx
+ movl %ecx, %ds
+ movl %ecx, %es
TRACE_IRQS_OFF
- movl %esp,%eax # pt_regs pointer
- call *%edi
- jmp ret_from_exception
- CFI_ENDPROC
+ movl %esp, %eax # pt_regs pointer
+ call *%edi
+ jmp ret_from_exception
END(page_fault)
#ifndef CONFIG_XEN
@@ -1492,36 +1341,31 @@ END(page_fault)
* the instruction that would have done it for sysenter.
*/
.macro FIX_STACK offset ok label
- cmpw $__KERNEL_CS, 4(%esp)
- jne \ok
+ cmpw $__KERNEL_CS, 4(%esp)
+ jne \ok
\label:
- movl TSS_sysenter_sp0 + \offset(%esp), %esp
- CFI_DEF_CFA esp, 0
- CFI_UNDEFINED eip
- pushfl_cfi
- pushl_cfi $__KERNEL_CS
- pushl_cfi $sysenter_past_esp
- CFI_REL_OFFSET eip, 0
+ movl TSS_sysenter_sp0 + \offset(%esp), %esp
+ pushfl
+ pushl $__KERNEL_CS
+ pushl $sysenter_past_esp
.endm
#endif /* CONFIG_XEN */
ENTRY(debug)
- RING0_INT_FRAME
ASM_CLAC
#ifndef CONFIG_XEN
- cmpl $ia32_sysenter_target,(%esp)
- jne debug_stack_correct
+ cmpl $entry_SYSENTER_32, (%esp)
+ jne debug_stack_correct
FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
debug_stack_correct:
#endif /* !CONFIG_XEN */
- pushl_cfi $-1 # mark this as an int
+ pushl $-1 # mark this as an int
SAVE_ALL
TRACE_IRQS_OFF
- xorl %edx,%edx # error code 0
- movl %esp,%eax # pt_regs pointer
- call do_debug
- jmp ret_from_exception
- CFI_ENDPROC
+ xorl %edx, %edx # error code 0
+ movl %esp, %eax # pt_regs pointer
+ call do_debug
+ jmp ret_from_exception
END(debug)
/*
@@ -1533,117 +1377,103 @@ END(debug)
* fault happened on the sysenter path.
*/
ENTRY(nmi)
- RING0_INT_FRAME
ASM_CLAC
#ifdef CONFIG_X86_ESPFIX32
- pushl_cfi %eax
- movl %ss, %eax
- cmpw $__ESPFIX_SS, %ax
- popl_cfi %eax
- je nmi_espfix_stack
+ pushl %eax
+ movl %ss, %eax
+ cmpw $__ESPFIX_SS, %ax
+ popl %eax
+ je nmi_espfix_stack
#endif
#ifndef CONFIG_XEN
- cmpl $ia32_sysenter_target,(%esp)
- je nmi_stack_fixup
- pushl_cfi %eax
- movl %esp,%eax
- /* Do not access memory above the end of our stack page,
+ cmpl $entry_SYSENTER_32, (%esp)
+ je nmi_stack_fixup
+ pushl %eax
+ movl %esp, %eax
+ /*
+ * Do not access memory above the end of our stack page,
* it might not exist.
*/
- andl $(THREAD_SIZE-1),%eax
- cmpl $(THREAD_SIZE-20),%eax
- popl_cfi %eax
- jae nmi_stack_correct
- cmpl $ia32_sysenter_target,12(%esp)
- je nmi_debug_stack_check
+ andl $(THREAD_SIZE-1), %eax
+ cmpl $(THREAD_SIZE-20), %eax
+ popl %eax
+ jae nmi_stack_correct
+ cmpl $entry_SYSENTER_32, 12(%esp)
+ je nmi_debug_stack_check
nmi_stack_correct:
- /* We have a RING0_INT_FRAME here */
- pushl_cfi %eax
+ pushl %eax
SAVE_ALL
- xorl %edx,%edx # zero error code
- movl %esp,%eax # pt_regs pointer
- call do_nmi
- jmp restore_all_notrace
- CFI_ENDPROC
+ xorl %edx, %edx # zero error code
+ movl %esp, %eax # pt_regs pointer
+ call do_nmi
+ jmp restore_all_notrace
nmi_stack_fixup:
- RING0_INT_FRAME
FIX_STACK 12, nmi_stack_correct, 1
- jmp nmi_stack_correct
+ jmp nmi_stack_correct
nmi_debug_stack_check:
- /* We have a RING0_INT_FRAME here */
- cmpw $__KERNEL_CS,16(%esp)
- jne nmi_stack_correct
- cmpl $debug,(%esp)
- jb nmi_stack_correct
- cmpl $debug_esp_fix_insn,(%esp)
- ja nmi_stack_correct
+ cmpw $__KERNEL_CS, 16(%esp)
+ jne nmi_stack_correct
+ cmpl $debug, (%esp)
+ jb nmi_stack_correct
+ cmpl $debug_esp_fix_insn, (%esp)
+ ja nmi_stack_correct
FIX_STACK 24, nmi_stack_correct, 1
- jmp nmi_stack_correct
+ jmp nmi_stack_correct
#ifdef CONFIG_X86_ESPFIX32
nmi_espfix_stack:
- /* We have a RING0_INT_FRAME here.
- *
+ /*
* create the pointer to lss back
*/
- pushl_cfi %ss
- pushl_cfi %esp
- addl $4, (%esp)
+ pushl %ss
+ pushl %esp
+ addl $4, (%esp)
/* copy the iret frame of 12 bytes */
.rept 3
- pushl_cfi 16(%esp)
+ pushl 16(%esp)
.endr
- pushl_cfi %eax
+ pushl %eax
SAVE_ALL
- FIXUP_ESPFIX_STACK # %eax == %esp
- xorl %edx,%edx # zero error code
- call do_nmi
+ FIXUP_ESPFIX_STACK # %eax == %esp
+ xorl %edx, %edx # zero error code
+ call do_nmi
RESTORE_REGS
- lss 12+4(%esp), %esp # back to espfix stack
- CFI_ADJUST_CFA_OFFSET -24
- jmp irq_return
+ lss 12+4(%esp), %esp # back to espfix stack
+ jmp irq_return
#endif
#else /* CONFIG_XEN */
- pushl_cfi %eax
+ pushl %eax
SAVE_ALL
- xorl %edx,%edx # zero error code
- movl %esp,%eax # pt_regs pointer
- call do_nmi
- orl $NMI_MASK, PT_EFLAGS(%esp)
- jmp restore_all
+ xorl %edx, %edx # zero error code
+ movl %esp, %eax # pt_regs pointer
+ call do_nmi
+ orl $NMI_MASK, PT_EFLAGS(%esp)
+ jmp restore_all
#endif
- CFI_ENDPROC
END(nmi)
ENTRY(int3)
- RING0_INT_FRAME
ASM_CLAC
- pushl_cfi $-1 # mark this as an int
+ pushl $-1 # mark this as an int
SAVE_ALL
TRACE_IRQS_OFF
- xorl %edx,%edx # zero error code
- movl %esp,%eax # pt_regs pointer
- call do_int3
- jmp ret_from_exception
- CFI_ENDPROC
+ xorl %edx, %edx # zero error code
+ movl %esp, %eax # pt_regs pointer
+ call do_int3
+ jmp ret_from_exception
END(int3)
ENTRY(general_protection)
- RING0_EC_FRAME
- pushl_cfi $do_general_protection
- jmp error_code
- CFI_ENDPROC
+ pushl $do_general_protection
+ jmp error_code
END(general_protection)
#ifdef CONFIG_KVM_GUEST
ENTRY(async_page_fault)
- RING0_EC_FRAME
ASM_CLAC
- pushl_cfi $do_async_page_fault
- jmp error_code
- CFI_ENDPROC
+ pushl $do_async_page_fault
+ jmp error_code
END(async_page_fault)
#endif
-
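
The 64-bit entry file gets the same treatment as the 32-bit one:
<asm/dwarf2.h> goes away, <asm/calling.h> is now arch/x86/entry/calling.h
(hence the #include "calling.h"), and the EMPTY_FRAME / BASIC_FRAME /
INTR_FRAME / XCPT_FRAME / DEFAULT_FRAME helpers, which emitted nothing but
CFI annotations, are deleted outright since no user of them remains.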
--- a/arch/x86/entry/entry_64-xen.S
+++ b/arch/x86/entry/entry_64-xen.S
@@ -7,34 +7,25 @@
* Jun Nakajima <jun.nakajima@intel.com>
* Asit Mallick <asit.k.mallick@intel.com>
* Modified for Xen
- */
-
-/*
+ *
* entry.S contains the system-call and fault low-level handling routines.
*
* Some of this is documented in Documentation/x86/entry_64.txt
*
- * NOTE: This code handles signal-recognition, which happens every time
- * after an interrupt and after each system call.
- *
* A note on terminology:
- * - iret frame: Architecture defined interrupt frame from SS to RIP
- * at the top of the kernel process stack.
+ * - iret frame: Architecture defined interrupt frame from SS to RIP
+ * at the top of the kernel process stack.
*
* Some macro usage:
- * - CFI macros are used to generate dwarf2 unwind information for better
- * backtraces. They don't change any code.
- * - ENTRY/END Define functions in the symbol table.
- * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
- * - idtentry - Define exception entry points.
+ * - ENTRY/END: Define functions in the symbol table.
+ * - TRACE_IRQ_*: Trace hardirq state for lock debugging.
+ * - idtentry: Define exception entry points.
*/
-
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
-#include <asm/dwarf2.h>
-#include <asm/calling.h>
+#include "calling.h"
#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/unistd.h>
@@ -54,18 +45,17 @@
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
-#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
-#define __AUDIT_ARCH_64BIT 0x80000000
-#define __AUDIT_ARCH_LE 0x40000000
-
- .code64
- .section .entry.text, "ax"
+#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_64BIT 0x80000000
+#define __AUDIT_ARCH_LE 0x40000000
+.code64
+.section .entry.text, "ax"
.macro TRACE_IRQS_IRETQ
#ifdef CONFIG_TRACE_IRQFLAGS
- bt $9,EFLAGS(%rsp) /* interrupts off? */
- jnc 1f
+ bt $9, EFLAGS(%rsp) /* interrupts off? */
+ jnc 1f
TRACE_IRQS_ON
1:
#endif
@@ -85,135 +75,65 @@
#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
.macro TRACE_IRQS_OFF_DEBUG
- call debug_stack_set_zero
+ call debug_stack_set_zero
TRACE_IRQS_OFF
- call debug_stack_reset
+ call debug_stack_reset
.endm
.macro TRACE_IRQS_ON_DEBUG
- call debug_stack_set_zero
+ call debug_stack_set_zero
TRACE_IRQS_ON
- call debug_stack_reset
+ call debug_stack_reset
.endm
.macro TRACE_IRQS_IRETQ_DEBUG
- bt $9,EFLAGS(%rsp) /* interrupts off? */
- jnc 1f
+ bt $9, EFLAGS(%rsp) /* interrupts off? */
+ jnc 1f
TRACE_IRQS_ON_DEBUG
1:
.endm
#else
-# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF
-# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON
-# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ
+# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF
+# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON
+# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ
#endif
NMI_MASK = 0x80000000
-
-/*
- * empty frame
- */
- .macro EMPTY_FRAME start=1 offset=0
- .if \start
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,8+\offset
- .else
- CFI_DEF_CFA_OFFSET 8+\offset
- .endif
- .endm
-
-/*
- * initial frame state for syscall
- */
- .macro BASIC_FRAME start=1 offset=0
- EMPTY_FRAME \start, 5*8+\offset
- /*CFI_REL_OFFSET ss, 4*8+\offset*/
- CFI_REL_OFFSET rsp, 3*8+\offset
- /*CFI_REL_OFFSET rflags, 2*8+\offset*/
- /*CFI_REL_OFFSET cs, 1*8+\offset*/
- CFI_REL_OFFSET rip, 0*8+\offset
- .endm
-
-/*
- * initial frame state for interrupts (and exceptions without error code)
- */
- .macro INTR_FRAME start=1 offset=0
- .if \start == 1
- BASIC_FRAME 1, \offset+2*8
- CFI_REL_OFFSET rcx, 0+\offset
- CFI_REL_OFFSET r11, 8+\offset
- .else
- BASIC_FRAME \start, \offset
- .endif
- .endm
-
-/*
- * initial frame state for exceptions with error code (and interrupts
- * with vector already pushed)
- */
- .macro XCPT_FRAME start=1 offset=0
- INTR_FRAME \start, 1*8+\offset
- .endm
-
-/*
- * frame that enables passing a complete pt_regs to a C function.
- */
- .macro DEFAULT_FRAME start=1 offset=0 extra=1
- XCPT_FRAME -(\start), ORIG_RAX+\offset
- CFI_REL_OFFSET rdi, RDI+\offset
- CFI_REL_OFFSET rsi, RSI+\offset
- CFI_REL_OFFSET rdx, RDX+\offset
- CFI_REL_OFFSET rcx, RCX+\offset
- CFI_REL_OFFSET rax, RAX+\offset
- CFI_REL_OFFSET r8, R8+\offset
- CFI_REL_OFFSET r9, R9+\offset
- CFI_REL_OFFSET r10, R10+\offset
- CFI_REL_OFFSET r11, R11+\offset
- .if \extra
- CFI_REL_OFFSET rbx, RBX+\offset
- CFI_REL_OFFSET rbp, RBP+\offset
- CFI_REL_OFFSET r12, R12+\offset
- CFI_REL_OFFSET r13, R13+\offset
- CFI_REL_OFFSET r14, R14+\offset
- CFI_REL_OFFSET r15, R15+\offset
- .endif
- .endm
- /*
- * Must be consistent with the definition in arch-x86/xen-x86_64.h:
- * struct iret_context {
- * u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
- * };
- * with rax, r11, and rcx being taken care of in the hypercall stub.
- */
+ /*
+ * Must be consistent with the definition in arch-x86/xen-x86_64.h:
+ * struct iret_context {
+ * u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
+ * };
+ * with rax, r11, and rcx being taken care of in the hypercall stub.
+ */
.macro HYPERVISOR_IRET flag
.if \flag == 0 # return from syscall always uses the hypercall
- testb $3,1*8(%rsp)
- jnz 2f
- testl $NMI_MASK,2*8(%rsp)
- jnz 2f
+ testb $3, 1*8(%rsp)
+ jnz 2f
+ testl $NMI_MASK, 2*8(%rsp)
+ jnz 2f
- cmpb $0,(xen_features+XENFEAT_supervisor_mode_kernel)(%rip)
- jne 1f
+ cmpb $0, xen_features+XENFEAT_supervisor_mode_kernel(%rip)
+ jne 1f
/* Direct iret to kernel space. Correct CS and SS. */
- orl $3,1*8(%rsp)
- orl $3,4*8(%rsp)
+ orl $3, 1*8(%rsp)
+ orl $3, 4*8(%rsp)
1: iretq
.endif
2: /* Slow iret via hypervisor. */
- andl $~NMI_MASK, 2*8(%rsp)
- pushq $\flag & VGCF_in_syscall
- jmp hypercall_page + (__HYPERVISOR_iret * 32)
+ andl $~NMI_MASK, 2*8(%rsp)
+ pushq $\flag & VGCF_in_syscall
+ jmp hypercall_page + (__HYPERVISOR_iret * 32)
.endm
/*
- * 64bit SYSCALL instruction entry. Up to 6 arguments in registers.
+ * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
*
- * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
* then loads new ss, cs, and rip from previously programmed MSRs.
* rflags gets masked by a value from another MSR (so CLD and CLAC
* are not needed). SYSCALL does not save anything on the stack
@@ -229,7 +149,7 @@ NMI_MASK = 0x80000000
* r10 arg3 (needs to be moved to rcx to conform to C ABI)
* r8 arg4
* r9 arg5
- * (note: r12-r15,rbp,rbx are callee-preserved in C ABI)
+ * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
*
* Only called from user space.
*
@@ -238,40 +158,38 @@ NMI_MASK = 0x80000000
* with them due to bugs in both AMD and Intel CPUs.
*/
-ENTRY(system_call)
- INTR_FRAME start=2 offset=2*8
+ENTRY(entry_SYSCALL_64)
/*
* Interrupts are enabled on entry.
*/
/* Construct struct pt_regs on stack */
- movl $__USER_DS,6*8(%rsp) /* pt_regs->ss */
- movl $__USER_CS,3*8(%rsp) /* pt_regs->cs */
- movq_cfi rax,8 /* pt_regs->orig_ax */
- movq_cfi rdi,0 /* pt_regs->di */
- pushq_cfi_reg rsi /* pt_regs->si */
- pushq_cfi_reg rdx /* pt_regs->dx */
- pushq_cfi_reg rcx /* pt_regs->cx */
- pushq_cfi $-ENOSYS /* pt_regs->ax */
- pushq_cfi_reg r8 /* pt_regs->r8 */
- pushq_cfi_reg r9 /* pt_regs->r9 */
- pushq_cfi_reg r10 /* pt_regs->r10 */
- pushq_cfi_reg r11 /* pt_regs->r11 */
- sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */
- CFI_ADJUST_CFA_OFFSET 6*8
-
- testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz tracesys
-system_call_fastpath:
+ movl $__USER_DS, 6*8(%rsp) /* pt_regs->ss */
+ movl $__USER_CS, 3*8(%rsp) /* pt_regs->cs */
+ movq %rax, 8(%rsp) /* pt_regs->orig_ax */
+ movq %rdi, 0(%rsp) /* pt_regs->di */
+ pushq %rsi /* pt_regs->si */
+ pushq %rdx /* pt_regs->dx */
+ pushq %rcx /* pt_regs->cx */
+ pushq $-ENOSYS /* pt_regs->ax */
+ pushq %r8 /* pt_regs->r8 */
+ pushq %r9 /* pt_regs->r9 */
+ pushq %r10 /* pt_regs->r10 */
+ pushq %r11 /* pt_regs->r11 */
+ sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
+
+ testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ jnz tracesys
+entry_SYSCALL_64_fastpath:
#if __SYSCALL_MASK == ~0
- cmpq $__NR_syscall_max,%rax
+ cmpq $__NR_syscall_max, %rax
#else
- andl $__SYSCALL_MASK,%eax
- cmpl $__NR_syscall_max,%eax
+ andl $__SYSCALL_MASK, %eax
+ cmpl $__NR_syscall_max, %eax
#endif
- ja 1f /* return -ENOSYS (already in pt_regs->ax) */
- movq %r10,%rcx
- call *sys_call_table(,%rax,8)
- movq %rax,RAX(%rsp)
+ ja 1f /* return -ENOSYS (already in pt_regs->ax) */
+ movq %r10, %rcx
+ call *sys_call_table(, %rax, 8)
+ movq %rax, RAX(%rsp)
1:
/*
* Syscall return path ending with SYSRET (fast path).
@@ -292,36 +210,32 @@ system_call_fastpath:
* flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
* very bad.
*/
- testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */
-
- CFI_REMEMBER_STATE
+ testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */
RESTORE_C_REGS_EXCEPT_RCX_R11
REMOVE_PT_GPREGS_FROM_STACK 8
- xor %ecx,%ecx
- xor %r11,%r11
- HYPERVISOR_IRET VGCF_IN_SYSCALL
-
- CFI_RESTORE_STATE
+ xor %ecx, %ecx
+ xor %r11, %r11
+ HYPERVISOR_IRET VGCF_IN_SYSCALL
/* Do syscall entry tracing */
tracesys:
- movq %rsp, %rdi
- movl $AUDIT_ARCH_X86_64, %esi
- call syscall_trace_enter_phase1
- test %rax, %rax
- jnz tracesys_phase2 /* if needed, run the slow path */
- RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */
- movq ORIG_RAX(%rsp), %rax
- jmp system_call_fastpath /* and return to the fast path */
+ movq %rsp, %rdi
+ movl $AUDIT_ARCH_X86_64, %esi
+ call syscall_trace_enter_phase1
+ test %rax, %rax
+ jnz tracesys_phase2 /* if needed, run the slow path */
+ RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */
+ movq ORIG_RAX(%rsp), %rax
+ jmp entry_SYSCALL_64_fastpath /* and return to the fast path */
tracesys_phase2:
SAVE_EXTRA_REGS
- movq %rsp, %rdi
- movl $AUDIT_ARCH_X86_64, %esi
- movq %rax,%rdx
- call syscall_trace_enter_phase2
+ movq %rsp, %rdi
+ movl $AUDIT_ARCH_X86_64, %esi
+ movq %rax, %rdx
+ call syscall_trace_enter_phase2
/*
* Reload registers from stack in case ptrace changed them.
@@ -331,15 +245,15 @@ tracesys_phase2:
RESTORE_C_REGS_EXCEPT_RAX
RESTORE_EXTRA_REGS
#if __SYSCALL_MASK == ~0
- cmpq $__NR_syscall_max,%rax
+ cmpq $__NR_syscall_max, %rax
#else
- andl $__SYSCALL_MASK,%eax
- cmpl $__NR_syscall_max,%eax
+ andl $__SYSCALL_MASK, %eax
+ cmpl $__NR_syscall_max, %eax
#endif
- ja 1f /* return -ENOSYS (already in pt_regs->ax) */
- movq %r10,%rcx /* fixup for C */
- call *sys_call_table(,%rax,8)
- movq %rax,RAX(%rsp)
+ ja 1f /* return -ENOSYS (already in pt_regs->ax) */
+ movq %r10, %rcx /* fixup for C */
+ call *sys_call_table(, %rax, 8)
+ movq %rax, RAX(%rsp)
1:
/* Use IRET because user could have changed pt_regs->foo */
@@ -351,31 +265,33 @@ GLOBAL(int_ret_from_sys_call)
DISABLE_INTERRUPTS(CLBR_NONE)
int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */
TRACE_IRQS_OFF
- movl $_TIF_ALLWORK_MASK,%edi
+ movl $_TIF_ALLWORK_MASK, %edi
/* edi: mask to check */
GLOBAL(int_with_check)
LOCKDEP_SYS_EXIT_IRQ
GET_THREAD_INFO(%rcx)
- movl TI_flags(%rcx),%edx
- andl %edi,%edx
- jnz int_careful
- andl $~TS_COMPAT,TI_status(%rcx)
+ movl TI_flags(%rcx), %edx
+ andl %edi, %edx
+ jnz int_careful
+ andl $~TS_COMPAT, TI_status(%rcx)
jmp restore_c_regs_and_iret
- /* Either reschedule or signal or syscall exit tracking needed. */
- /* First do a reschedule test. */
- /* edx: work, edi: workmask */
+ /*
+ * Either reschedule or signal or syscall exit tracking needed.
+ * First do a reschedule test.
+ * edx: work, edi: workmask
+ */
int_careful:
- bt $TIF_NEED_RESCHED,%edx
- jnc int_very_careful
+ bt $TIF_NEED_RESCHED, %edx
+ jnc int_very_careful
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- pushq_cfi %rdi
+ pushq %rdi
SCHEDULE_USER
- popq_cfi %rdi
+ popq %rdi
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- jmp int_with_check
+ jmp int_with_check
/* handle signals and tracing -- both require a full pt_regs */
int_very_careful:
@@ -383,38 +299,34 @@ int_very_careful:
ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_EXTRA_REGS
/* Check for syscall exit trace */
- testl $_TIF_WORK_SYSCALL_EXIT,%edx
- jz int_signal
- pushq_cfi %rdi
- leaq 8(%rsp),%rdi # &ptregs -> arg1
- call syscall_trace_leave
- popq_cfi %rdi
- andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
- jmp int_restore_rest
+ testl $_TIF_WORK_SYSCALL_EXIT, %edx
+ jz int_signal
+ pushq %rdi
+ leaq 8(%rsp), %rdi /* &ptregs -> arg1 */
+ call syscall_trace_leave
+ popq %rdi
+ andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU), %edi
+ jmp int_restore_rest
int_signal:
- testl $_TIF_DO_NOTIFY_MASK,%edx
- jz 1f
- movq %rsp,%rdi # &ptregs -> arg1
- xorl %esi,%esi # oldset -> arg2
- call do_notify_resume
-1: movl $_TIF_WORK_MASK,%edi
+ testl $_TIF_DO_NOTIFY_MASK, %edx
+ jz 1f
+ movq %rsp, %rdi /* &ptregs -> arg1 */
+ xorl %esi, %esi /* oldset -> arg2 */
+ call do_notify_resume
+1: movl $_TIF_WORK_MASK, %edi
int_restore_rest:
RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- jmp int_with_check
- CFI_ENDPROC
-END(system_call)
+ jmp int_with_check
+END(entry_SYSCALL_64)
.macro FORK_LIKE func
ENTRY(stub_\func)
- CFI_STARTPROC
- DEFAULT_FRAME 0, 8, 0 /* offset 8: return address */
SAVE_EXTRA_REGS 8
- jmp sys_\func
- CFI_ENDPROC
+ jmp sys_\func
END(stub_\func)
.endm
@@ -423,8 +335,6 @@ END(stub_\func)
FORK_LIKE vfork
ENTRY(stub_execve)
- CFI_STARTPROC
- DEFAULT_FRAME 0, 8, 0
call sys_execve
return_from_execve:
testl %eax, %eax
@@ -434,11 +344,9 @@ return_from_execve:
1:
/* must use IRET code path (pt_regs->cs may have changed) */
addq $8, %rsp
- CFI_ADJUST_CFA_OFFSET -8
ZERO_EXTRA_REGS
- movq %rax,RAX(%rsp)
+ movq %rax, RAX(%rsp)
jmp int_ret_from_sys_call
- CFI_ENDPROC
END(stub_execve)
/*
* Remaining execve stubs are only 7 bytes long.
@@ -446,47 +354,25 @@ END(stub_execve)
*/
.align 8
GLOBAL(stub_execveat)
- CFI_STARTPROC
- DEFAULT_FRAME 0, 8, 0
call sys_execveat
jmp return_from_execve
- CFI_ENDPROC
END(stub_execveat)
-#ifdef CONFIG_X86_X32_ABI
+#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION)
.align 8
GLOBAL(stub_x32_execve)
- CFI_STARTPROC
- DEFAULT_FRAME 0, 8, 0
- call compat_sys_execve
- jmp return_from_execve
- CFI_ENDPROC
-END(stub_x32_execve)
- .align 8
-GLOBAL(stub_x32_execveat)
- CFI_STARTPROC
- DEFAULT_FRAME 0, 8, 0
- call compat_sys_execveat
- jmp return_from_execve
- CFI_ENDPROC
-END(stub_x32_execveat)
-#endif
-
-#ifdef CONFIG_IA32_EMULATION
- .align 8
GLOBAL(stub32_execve)
- CFI_STARTPROC
call compat_sys_execve
jmp return_from_execve
- CFI_ENDPROC
END(stub32_execve)
+END(stub_x32_execve)
.align 8
+GLOBAL(stub_x32_execveat)
GLOBAL(stub32_execveat)
- CFI_STARTPROC
call compat_sys_execveat
jmp return_from_execve
- CFI_ENDPROC
END(stub32_execveat)
+END(stub_x32_execveat)
#endif
/*
@@ -494,8 +380,6 @@ END(stub32_execveat)
* This cannot be done with SYSRET, so use the IRET return path instead.
*/
ENTRY(stub_rt_sigreturn)
- CFI_STARTPROC
- DEFAULT_FRAME 0, 8, 0
/*
* SAVE_EXTRA_REGS result is not normally needed:
* sigreturn overwrites all pt_regs->GPREGS.
@@ -504,24 +388,19 @@ ENTRY(stub_rt_sigreturn)
* we SAVE_EXTRA_REGS here.
*/
SAVE_EXTRA_REGS 8
- call sys_rt_sigreturn
+ call sys_rt_sigreturn
return_from_stub:
addq $8, %rsp
- CFI_ADJUST_CFA_OFFSET -8
RESTORE_EXTRA_REGS
- movq %rax,RAX(%rsp)
- jmp int_ret_from_sys_call
- CFI_ENDPROC
+ movq %rax, RAX(%rsp)
+ jmp int_ret_from_sys_call
END(stub_rt_sigreturn)
#ifdef CONFIG_X86_X32_ABI
ENTRY(stub_x32_rt_sigreturn)
- CFI_STARTPROC
- DEFAULT_FRAME 0, 8, 0
SAVE_EXTRA_REGS 8
- call sys32_x32_rt_sigreturn
- jmp return_from_stub
- CFI_ENDPROC
+ call sys32_x32_rt_sigreturn
+ jmp return_from_stub
END(stub_x32_rt_sigreturn)
#endif
@@ -531,53 +410,55 @@ END(stub_x32_rt_sigreturn)
* rdi: prev task we switched from
*/
ENTRY(ret_from_fork)
- DEFAULT_FRAME
- LOCK ; btr $TIF_FORK,TI_flags(%r8)
+ LOCK ; btr $TIF_FORK, TI_flags(%r8)
- pushq_cfi $0x0002
- popfq_cfi # reset kernel eflags
+ pushq $0x0002
+ popfq /* reset kernel eflags */
- call schedule_tail # rdi: 'prev' task parameter
+ call schedule_tail /* rdi: 'prev' task parameter */
RESTORE_EXTRA_REGS
- testl $3,CS(%rsp) # from kernel_thread?
+ testb $3, CS(%rsp) /* from kernel_thread? */
/*
* By the time we get here, we have no idea whether our pt_regs,
* ti flags, and ti status came from the 64-bit SYSCALL fast path,
- * the slow path, or one of the ia32entry paths.
+ * the slow path, or one of the 32-bit compat paths.
* Use IRET code path to return, since it can safely handle
* all of the above.
*/
jnz int_ret_from_sys_call
- /* We came from kernel_thread */
- /* Need to set the proper %ss (not NULL) for ring 3 iretq */
- movl $__KERNEL_DS, SS(%rsp)
+ /*
+ * We came from kernel_thread
+ * Need to set the proper %ss (not NULL) for ring 3 iretq
+ */
+ movl $__KERNEL_DS, SS(%rsp)
/* nb: we depend on RESTORE_EXTRA_REGS above */
- movq %rbp, %rdi
- call *%rbx
- movl $0, RAX(%rsp)
+ movq %rbp, %rdi
+ call *%rbx
+ movl $0, RAX(%rsp)
RESTORE_EXTRA_REGS
- jmp int_ret_from_sys_call
- CFI_ENDPROC
+ jmp int_ret_from_sys_call
END(ret_from_fork)
/*
* Interrupt exit.
- */
+ */
+ /* Interrupt came from user space */
+retint_user:
+ GET_THREAD_INFO(%rcx)
+ /* %rcx: thread info. Interrupts are off. */
retint_with_reschedule:
- DEFAULT_FRAME extra=0
- movl $_TIF_WORK_MASK,%edi
+ movl $_TIF_WORK_MASK, %edi
retint_check:
LOCKDEP_SYS_EXIT_IRQ
- movl TI_flags(%rcx),%edx
- andl %edi,%edx
- CFI_REMEMBER_STATE
- jnz retint_careful
+ movl TI_flags(%rcx), %edx
+ andl %edi, %edx
+ jnz retint_careful
jmp restore_c_regs_and_iret
/* Returning to kernel space */
@@ -585,9 +466,9 @@ retint_kernel:
#ifdef CONFIG_PREEMPT
/* Interrupts are off */
/* Check if we need preemption */
- bt $9,EFLAGS(%rsp) /* interrupts were off? */
+ bt $9, EFLAGS(%rsp) /* were interrupts off? */
jnc 1f
-0: cmpl $0,PER_CPU_VAR(__preempt_count)
+0: cmpl $0, PER_CPU_VAR(__preempt_count)
jnz 1f
call preempt_schedule_irq
jmp 0b
@@ -599,12 +480,12 @@ retint_kernel:
* which come from interrupts/exception and from syscalls, merge.
*/
restore_c_regs_and_iret:
- movl EFLAGS(%rsp), %eax
- shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
+ movl EFLAGS(%rsp), %eax
+ shr $9, %eax /* EAX[0] == IRET_EFLAGS.IF */
GET_VCPU_INFO
- andb evtchn_upcall_mask(%rsi),%al
- andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask
- jnz restore_all_enable_events # != 0 => enable event delivery
+ andb evtchn_upcall_mask(%rsi), %al
+ andb $1, %al /* EAX[0] == IRET_EFLAGS.IF & event_mask */
+ jnz restore_all_enable_events /* != 0 => enable event delivery */
.Lrestore_c_regs_and_iret:
/*
* The iretq could re-enable interrupts:
@@ -613,39 +494,37 @@ restore_c_regs_and_iret:
RESTORE_C_REGS
REMOVE_PT_GPREGS_FROM_STACK 8
HYPERVISOR_IRET 0
-
+
/* edi: workmask, edx: work */
retint_careful:
- CFI_RESTORE_STATE
- bt $TIF_NEED_RESCHED,%edx
- jnc retint_signal
+ bt $TIF_NEED_RESCHED, %edx
+ jnc retint_signal
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- pushq_cfi %rdi
+ pushq %rdi
SCHEDULE_USER
- popq_cfi %rdi
+ popq %rdi
GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- jmp retint_check
+ jmp retint_check
retint_signal:
- testl $_TIF_DO_NOTIFY_MASK,%edx
- jz restore_c_regs_and_iret
+ testl $_TIF_DO_NOTIFY_MASK, %edx
+ jz restore_c_regs_and_iret
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_EXTRA_REGS
- movq $-1,ORIG_RAX(%rsp)
- xorl %esi,%esi # oldset
- movq %rsp,%rdi # &pt_regs
- call do_notify_resume
+ movq $-1, ORIG_RAX(%rsp)
+ xorl %esi, %esi /* oldset */
+ movq %rsp, %rdi /* &pt_regs */
+ call do_notify_resume
RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
GET_THREAD_INFO(%rcx)
- jmp retint_with_reschedule
+ jmp retint_with_reschedule
- CFI_ENDPROC
END(retint_check)
/*
@@ -660,102 +539,89 @@ ENTRY(\sym)
.error "using shift_ist requires paranoid=1"
.endif
- .if \has_error_code
- XCPT_FRAME
- .else
- INTR_FRAME
- .endif
-
ASM_CLAC
- movq_cfi_restore 0,rcx
- movq_cfi_restore 8,r11
+ movq 0(%rsp), %rcx
+ movq 8(%rsp), %r11
.if \has_error_code
ALLOC_PT_GPREGS_ON_STACK -2*8
.else
- movq $-1,8(%rsp) /* ORIG_RAX: no syscall to restart */
+ movq $-1, 8(%rsp) /* ORIG_RAX: no syscall to restart */
ALLOC_PT_GPREGS_ON_STACK -1*8
.endif
.if \paranoid
.if \paranoid == 1
- CFI_REMEMBER_STATE
- testl $3, CS(%rsp) /* If coming from userspace, switch */
- jnz 1f /* stacks. */
+ testb $3, CS(%rsp) /* If coming from userspace, switch stacks */
+ jnz 1f
.endif
- call paranoid_entry
+ call paranoid_entry
.else
- call error_entry
+ call error_entry
.endif
/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
- DEFAULT_FRAME 0
-
.if \paranoid
.if \shift_ist != -1
- TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */
+ TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */
.else
TRACE_IRQS_OFF
.endif
.endif
- movq %rsp,%rdi /* pt_regs pointer */
+ movq %rsp, %rdi /* pt_regs pointer */
.if \has_error_code
- movq ORIG_RAX(%rsp),%rsi /* get error code */
- movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
+ movq ORIG_RAX(%rsp), %rsi /* get error code */
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
.else
- xorl %esi,%esi /* no error code */
+ xorl %esi, %esi /* no error code */
.endif
.if \shift_ist != -1
- subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
+ subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
.endif
- call \do_sym
+ call \do_sym
.if \shift_ist != -1
- addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
+ addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
.endif
/* these procedures expect "no swapgs" flag in ebx */
.if \paranoid
- jmp paranoid_exit
+ jmp paranoid_exit
.else
- jmp error_exit
+ jmp error_exit
.endif
.if \paranoid == 1
- CFI_RESTORE_STATE
/*
* Paranoid entry from userspace. Switch stacks and treat it
* as a normal entry. This means that paranoid handlers
* run in real process context if user_mode(regs).
*/
1:
- call error_entry
+ call error_entry
- DEFAULT_FRAME 0
- movq %rsp,%rdi /* pt_regs pointer */
- call sync_regs
- movq %rax,%rsp /* switch stack */
+ movq %rsp, %rdi /* pt_regs pointer */
+ call sync_regs
+ movq %rax, %rsp /* switch stack */
- movq %rsp,%rdi /* pt_regs pointer */
+ movq %rsp, %rdi /* pt_regs pointer */
.if \has_error_code
- movq ORIG_RAX(%rsp),%rsi /* get error code */
- movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
+ movq ORIG_RAX(%rsp), %rsi /* get error code */
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
.else
- xorl %esi,%esi /* no error code */
+ xorl %esi, %esi /* no error code */
.endif
- call \do_sym
+ call \do_sym
- jmp error_exit /* %ebx: no swapgs flag */
+ jmp error_exit /* %ebx: no swapgs flag */
.endif
-
- CFI_ENDPROC
END(\sym)
.endm
@@ -785,41 +651,33 @@ idtentry \sym \do_sym has_error_code=\ha
# existing activation in its critical region -- if so, we pop the current
# activation and restart the handler using the previous one.
ENTRY(do_hypervisor_callback) # do_hypervisor_callback(struct pt_regs *)
- CFI_STARTPROC
# Since we don't modify %rdi, evtchn_do_upcall(struct pt_regs *) will
# see the correct pointer to the pt_regs
- movq %rdi, %rsp # we don't return, adjust the stack frame
- CFI_ENDPROC
- DEFAULT_FRAME
-11: incl PER_CPU_VAR(irq_count)
- movq %rsp,%rbp
- CFI_DEF_CFA_REGISTER rbp
- cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
- pushq %rbp # backlink for old unwinder
- call evtchn_do_upcall
- popq %rsp
- CFI_DEF_CFA_REGISTER rsp
- decl PER_CPU_VAR(irq_count)
- jmp error_exit
- CFI_ENDPROC
+ movq %rdi, %rsp # we don't return, adjust the stack frame
+11: incl PER_CPU_VAR(irq_count)
+ movq %rsp, %rbp
+ cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp
+ pushq %rbp # backlink for old unwinder
+ call evtchn_do_upcall
+ popq %rsp
+ decl PER_CPU_VAR(irq_count)
+ jmp error_exit
END(do_hypervisor_callback)
ALIGN
restore_all_enable_events:
- DEFAULT_FRAME extra=0
TRACE_IRQS_ON
__ENABLE_INTERRUPTS
.Lscrit: /**** START OF CRITICAL REGION ****/
__TEST_PENDING
- jz .Lrestore_c_regs_and_iret
+ jz .Lrestore_c_regs_and_iret
__DISABLE_INTERRUPTS
.Lecrit: /**** END OF CRITICAL REGION ****/
SAVE_EXTRA_REGS
- movq %rsp,%rdi # set the argument again
- jmp 11b
- CFI_ENDPROC
+ movq %rsp, %rdi # set the argument again
+ jmp 11b
# At this point, unlike on x86-32, we don't do the fixup to simplify the
# code and the stack frame is more complex on x86-64.
# When the kernel is interrupted in the critical section, the kernel
@@ -838,85 +696,75 @@ restore_all_enable_events:
# We distinguish between categories by comparing each saved segment register
# with its current contents: any discrepancy means we are in category 1.
ENTRY(failsafe_callback)
- INTR_FRAME offset=4*8
ASM_CLAC
- movw %ds,%cx
- cmpw %cx,0x10(%rsp)
- CFI_REMEMBER_STATE
- jne 1f
- movw %es,%cx
- cmpw %cx,0x18(%rsp)
- jne 1f
- movw %fs,%cx
- cmpw %cx,0x20(%rsp)
- jne 1f
- movw %gs,%cx
- cmpw %cx,0x28(%rsp)
- jne 1f
+ movw %ds, %cx
+ cmpw %cx, 0x10(%rsp)
+ jne 1f
+ movw %es, %cx
+ cmpw %cx, 0x18(%rsp)
+ jne 1f
+ movw %fs, %cx
+ cmpw %cx, 0x20(%rsp)
+ jne 1f
+ movw %gs, %cx
+ cmpw %cx, 0x28(%rsp)
+ jne 1f
/* All segments match their saved values => Category 2 (Bad IRET). */
- movq_cfi_restore 0,rcx
- movq_cfi_restore 8,r11
- addq $0x30,%rsp
- CFI_ADJUST_CFA_OFFSET -0x30
- movq $11,%rdi /* SIGSEGV */
- jmp do_exit
- CFI_RESTORE_STATE
+ movq 0(%rsp), %rcx
+ movq 8(%rsp), %r11
+ addq $0x30, %rsp
+ movq $11, %rdi /* SIGSEGV */
+ jmp do_exit
1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
- movq_cfi_restore 0,rcx
- movq_cfi_restore 8,r11
- addq $0x30,%rsp
- CFI_ADJUST_CFA_OFFSET -0x30
- pushq_cfi $-1
+ movq 0(%rsp), %rcx
+ movq 8(%rsp), %r11
+ addq $0x30, %rsp
+ pushq $-1
ALLOC_PT_GPREGS_ON_STACK
SAVE_C_REGS
SAVE_EXTRA_REGS
- jmp error_exit
- CFI_ENDPROC
+ jmp error_exit
-idtentry divide_error do_divide_error has_error_code=0
-idtentry overflow do_overflow has_error_code=0
-idtentry bounds do_bounds has_error_code=0
-idtentry invalid_op do_invalid_op has_error_code=0
-idtentry device_not_available do_device_not_available has_error_code=0
-idtentry hypervisor_callback do_hypervisor_callback has_error_code=0
-idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
-idtentry invalid_TSS do_invalid_TSS has_error_code=1
-idtentry segment_not_present do_segment_not_present has_error_code=1
-idtentry coprocessor_error do_coprocessor_error has_error_code=0
-idtentry alignment_check do_alignment_check has_error_code=1
-idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
+idtentry divide_error do_divide_error has_error_code=0
+idtentry overflow do_overflow has_error_code=0
+idtentry bounds do_bounds has_error_code=0
+idtentry invalid_op do_invalid_op has_error_code=0
+idtentry device_not_available do_device_not_available has_error_code=0
+idtentry hypervisor_callback do_hypervisor_callback has_error_code=0
+idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
+idtentry invalid_TSS do_invalid_TSS has_error_code=1
+idtentry segment_not_present do_segment_not_present has_error_code=1
+idtentry coprocessor_error do_coprocessor_error has_error_code=0
+idtentry alignment_check do_alignment_check has_error_code=1
+idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
/* Call softirq on interrupt stack. Interrupts are off. */
ENTRY(do_softirq_own_stack)
- CFI_STARTPROC
- pushq_cfi %rbp
- CFI_REL_OFFSET rbp,0
- mov %rsp,%rbp
- CFI_DEF_CFA_REGISTER rbp
- incl PER_CPU_VAR(irq_count)
- cmove PER_CPU_VAR(irq_stack_ptr),%rsp
- push %rbp # backlink for old unwinder
- call __do_softirq
+ pushq %rbp
+ mov %rsp, %rbp
+ incl PER_CPU_VAR(irq_count)
+ cmove PER_CPU_VAR(irq_stack_ptr), %rsp
+ push %rbp /* frame pointer backlink */
+ call __do_softirq
leaveq
- CFI_RESTORE rbp
- CFI_DEF_CFA_REGISTER rsp
- CFI_ADJUST_CFA_OFFSET -8
- decl PER_CPU_VAR(irq_count)
+ decl PER_CPU_VAR(irq_count)
ret
- CFI_ENDPROC
END(do_softirq_own_stack)
-idtentry debug do_debug has_error_code=0
-idtentry nmi do_nmi_callback has_error_code=0
-idtentry int3 do_int3 has_error_code=0
-idtentry stack_segment do_stack_segment has_error_code=1
-idtentry general_protection do_general_protection has_error_code=1
-trace_idtentry page_fault do_page_fault has_error_code=1
+idtentry debug do_debug has_error_code=0
+idtentry nmi do_nmi_callback has_error_code=0
+idtentry int3 do_int3 has_error_code=0
+idtentry stack_segment do_stack_segment has_error_code=1
+
+idtentry general_protection do_general_protection has_error_code=1
+trace_idtentry page_fault do_page_fault has_error_code=1
+
#ifdef CONFIG_KVM_GUEST
-idtentry async_page_fault do_async_page_fault has_error_code=1
+idtentry async_page_fault do_async_page_fault has_error_code=1
#endif
+
#ifdef CONFIG_X86_MCE
-idtentry machine_check has_error_code=0 do_sym=*machine_check_vector(%rip)
+idtentry machine_check has_error_code=0 do_sym=*machine_check_vector(%rip)
#endif
#ifndef CONFIG_XEN
@@ -926,19 +774,17 @@ idtentry machine_check has_error_code=0
* Return: ebx=0: need swapgs on exit, ebx=1: otherwise
*/
ENTRY(paranoid_entry)
- XCPT_FRAME 1 15*8
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
- movl $1,%ebx
- movl $MSR_GS_BASE,%ecx
+ movl $1, %ebx
+ movl $MSR_GS_BASE, %ecx
rdmsr
- testl %edx,%edx
- js 1f /* negative -> in kernel */
+ testl %edx, %edx
+ js 1f /* negative -> in kernel */
SWAPGS
- xorl %ebx,%ebx
+ xorl %ebx, %ebx
1: ret
- CFI_ENDPROC
END(paranoid_entry)
/*
@@ -950,17 +796,17 @@ END(paranoid_entry)
* in syscall entry), so checking for preemption here would
 * be complicated. Fortunately, there's no good reason
* to try to handle preemption here.
+ *
+ * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
*/
-/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
ENTRY(paranoid_exit)
- DEFAULT_FRAME
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF_DEBUG
- testl %ebx,%ebx /* swapgs needed? */
- jnz paranoid_exit_no_swapgs
+ testl %ebx, %ebx /* swapgs needed? */
+ jnz paranoid_exit_no_swapgs
TRACE_IRQS_IRETQ
SWAPGS_UNSAFE_STACK
- jmp paranoid_exit_restore
+ jmp paranoid_exit_restore
paranoid_exit_no_swapgs:
TRACE_IRQS_IRETQ_DEBUG
paranoid_exit_restore:
@@ -968,26 +814,26 @@ paranoid_exit_restore:
RESTORE_C_REGS
REMOVE_PT_GPREGS_FROM_STACK 8
INTERRUPT_RETURN
- CFI_ENDPROC
END(paranoid_exit)
#endif
/*
* Save all registers in pt_regs, and switch gs if needed.
- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ * Return: EBX=0: came from user mode; EBX=1: otherwise
*/
ENTRY(error_entry)
- XCPT_FRAME 2 15*8
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
#ifndef CONFIG_XEN
- xorl %ebx,%ebx
- testl $3,CS+8(%rsp)
- je error_kernelspace
-error_swapgs:
+ xorl %ebx, %ebx
+ testb $3, CS+8(%rsp)
+ jz error_kernelspace
+
+ /* We entered from user mode */
SWAPGS
-error_sti:
+
+error_entry_done:
#endif
TRACE_IRQS_OFF
ret
@@ -1000,53 +846,59 @@ error_sti:
* for these here too.
*/
error_kernelspace:
- CFI_REL_OFFSET rcx, RCX+8
- incl %ebx
- leaq native_irq_return_iret(%rip),%rcx
- cmpq %rcx,RIP+8(%rsp)
- je error_bad_iret
- movl %ecx,%eax /* zero extend */
- cmpq %rax,RIP+8(%rsp)
- je bstep_iret
- cmpq $gs_change,RIP+8(%rsp)
- je error_swapgs
- jmp error_sti
+ incl %ebx
+ leaq native_irq_return_iret(%rip), %rcx
+ cmpq %rcx, RIP+8(%rsp)
+ je error_bad_iret
+ movl %ecx, %eax /* zero extend */
+ cmpq %rax, RIP+8(%rsp)
+ je bstep_iret
+ cmpq $gs_change, RIP+8(%rsp)
+ jne error_entry_done
+
+ /*
+ * hack: gs_change can fail with user gsbase. If this happens, fix up
+ * gsbase and proceed. We'll fix up the exception and land in
+ * gs_change's error handler with kernel gsbase.
+ */
+ SWAPGS
+ jmp error_entry_done
bstep_iret:
/* Fix truncated RIP */
- movq %rcx,RIP+8(%rsp)
+ movq %rcx, RIP+8(%rsp)
/* fall through */
error_bad_iret:
+ /*
+ * We came from an IRET to user mode, so we have user gsbase.
+ * Switch to kernel gsbase:
+ */
SWAPGS
- mov %rsp,%rdi
- call fixup_bad_iret
- mov %rax,%rsp
- decl %ebx /* Return to usergs */
- jmp error_sti
+
+ /*
+ * Pretend that the exception came from user mode: set up pt_regs
+ * as if we faulted immediately after IRET and clear EBX so that
+ * error_exit knows that we will be returning to user mode.
+ */
+ mov %rsp, %rdi
+ call fixup_bad_iret
+ mov %rax, %rsp
+ decl %ebx
+ jmp error_entry_done
#endif
- CFI_ENDPROC
END(error_entry)
ENTRY(error_exit)
- DEFAULT_FRAME
RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- GET_THREAD_INFO(%rcx)
- testb $3,CS(%rsp)
- jz retint_kernel
- LOCKDEP_SYS_EXIT_IRQ
- movl TI_flags(%rcx),%edx
- movl $_TIF_WORK_MASK,%edi
- andl %edi,%edx
- jnz retint_careful
- jmp restore_c_regs_and_iret
- CFI_ENDPROC
+ testb $3, CS(%rsp)
+ jz retint_kernel
+ jmp retint_user
END(error_exit)
-
#define extern #
#include <asm-generic/percpu.h>
@@ -1055,38 +907,29 @@ in_NMI: .byte 0
.popsection
do_nmi_callback:
- CFI_STARTPROC
- addq $8, %rsp
- CFI_ENDPROC
- DEFAULT_FRAME
- orb $1, PER_CPU_VAR(in_NMI)
- js 1f
+ addq $8, %rsp
+ orb $1, PER_CPU_VAR(in_NMI)
+ js 1f
0:
- movb $0x80, PER_CPU_VAR(in_NMI)
- call do_nmi
- movl $0x80, %eax
+ movb $0x80, PER_CPU_VAR(in_NMI)
+ call do_nmi
+ movl $0x80, %eax
cmpxchgb %ah, PER_CPU_VAR(in_NMI)
- jne 0b
- orl $NMI_MASK,EFLAGS(%rsp)
+ jne 0b
+ orl $NMI_MASK, EFLAGS(%rsp)
1:
RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
GET_THREAD_INFO(%rcx)
- jmp restore_c_regs_and_iret
- CFI_ENDPROC
+ jmp restore_c_regs_and_iret
END(do_nmi_callback)
-
#ifndef CONFIG_IA32_EMULATION
ENTRY(ignore_sysret)
- INTR_FRAME
- popq_cfi_reg rcx
- popq_cfi_reg r11
- mov $-ENOSYS,%eax
- # any non-zero value not having VGCF_in_syscall set will do:
+ popq %rcx
+ popq %r11
+ mov $-ENOSYS, %eax
HYPERVISOR_IRET VGCF_i387_valid
- CFI_ENDPROC
END(ignore_sysret)
#endif
-
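
[Note: the HYPERVISOR_IRET macro in this file hard-codes the 1*8/2*8/4*8
stack offsets of cs, rflags and ss, relying on the struct iret_context
layout quoted in its comment. A minimal standalone C sketch -- not kernel
code; the struct name follows the comment, everything else is
illustrative -- of how that consistency could be checked at build time:

#include <stddef.h>
#include <stdint.h>

/* Mirrors the layout quoted in the HYPERVISOR_IRET comment; the real
 * definition lives in the Xen interface headers. */
struct iret_context {
	uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
};

/* At HYPERVISOR_IRET time rax/r11/rcx/flags were already consumed by
 * the hypercall stub, so %rsp points at .rip: */
#define IRET_OFF(f) (offsetof(struct iret_context, f) - \
		     offsetof(struct iret_context, rip))

_Static_assert(IRET_OFF(cs)     == 1*8, "testb/orl $3, 1*8(%rsp)");
_Static_assert(IRET_OFF(rflags) == 2*8, "testl $NMI_MASK, 2*8(%rsp)");
_Static_assert(IRET_OFF(ss)     == 4*8, "orl $3, 4*8(%rsp)");
]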
--- a/arch/x86/entry/entry_64_compat-xen.S
+++ b/arch/x86/entry/entry_64_compat-xen.S
@@ -1,16 +1,14 @@
/*
- * Compatibility mode system call entry point for x86-64.
- *
+ * Compatibility mode system call entry point for x86-64.
+ *
* Copyright 2000-2002 Andi Kleen, SuSE Labs.
- */
-
-#include <asm/dwarf2.h>
-#include <asm/calling.h>
+ */
+#include "calling.h"
#include <asm/asm-offsets.h>
#include <asm/current.h>
#include <asm/errno.h>
-#include <asm/ia32_unistd.h>
-#include <asm/thread_info.h>
+#include <asm/ia32_unistd.h>
+#include <asm/thread_info.h>
#include <asm/segment.h>
#include <asm/irqflags.h>
#include <asm/asm.h>
@@ -21,67 +19,19 @@
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
-#define __AUDIT_ARCH_LE 0x40000000
+#define __AUDIT_ARCH_LE 0x40000000
.section .entry.text, "ax"
- /* clobbers %rax */
- .macro CLEAR_RREGS _r9=rax
- xorl %eax,%eax
- movq %rax,R11(%rsp)
- movq %rax,R10(%rsp)
- movq %\_r9,R9(%rsp)
- movq %rax,R8(%rsp)
- .endm
-
- /*
- * Reload arg registers from stack in case ptrace changed them.
- * We don't reload %eax because syscall_trace_enter() returned
- * the %rax value we should see. Instead, we just truncate that
- * value to 32 bits again as we did on entry from user mode.
- * If it's a new value set by user_regset during entry tracing,
- * this matches the normal truncation of the user-mode value.
- * If it's -1 to make us punt the syscall, then (u32)-1 is still
- * an appropriately invalid value.
- */
- .macro LOAD_ARGS32 _r9=0
- .if \_r9
- movl R9(%rsp),%r9d
- .endif
- movl RCX(%rsp),%ecx
- movl RDX(%rsp),%edx
- movl RSI(%rsp),%esi
- movl RDI(%rsp),%edi
- movl %eax,%eax /* zero extension */
- .endm
-
- .macro CFI_STARTPROC32 simple
- CFI_STARTPROC \simple
- CFI_UNDEFINED r8
- CFI_UNDEFINED r9
- CFI_UNDEFINED r10
- CFI_UNDEFINED r11
- CFI_UNDEFINED r12
- CFI_UNDEFINED r13
- CFI_UNDEFINED r14
- CFI_UNDEFINED r15
- .endm
-
#ifdef CONFIG_PARAVIRT
ENTRY(native_usergs_sysret32)
swapgs
sysretl
ENDPROC(native_usergs_sysret32)
-
-ENTRY(native_irq_enable_sysexit)
- swapgs
- sti
- sysexit
-ENDPROC(native_irq_enable_sysexit)
#endif
/*
- * 32bit SYSENTER instruction entry.
+ * 32-bit SYSENTER instruction entry.
*
* SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
* IF and VM in rflags are cleared (IOW: interrupts are off).
@@ -102,94 +52,104 @@ ENDPROC(native_irq_enable_sysexit)
* path below. We set up a complete hardware stack frame to share code
* with the int 0x80 path.
*/
-ENTRY(ia32_sysenter_target)
- CFI_STARTPROC32 simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,SS+8-RIP+16
- /*CFI_REL_OFFSET ss,SS-RIP+16*/
- CFI_REL_OFFSET rsp,RSP-RIP+16
- /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/
- /*CFI_REL_OFFSET cs,CS-RIP+16*/
- CFI_REL_OFFSET rip,RIP-RIP+16
- CFI_REL_OFFSET r11,8
- CFI_REL_OFFSET rcx,0
+ENTRY(entry_SYSENTER_compat)
movq 8(%rsp),%r11
- CFI_RESTORE r11
- popq_cfi %rcx
- CFI_RESTORE rcx
+ popq %rcx
/* Zero-extending 32-bit regs, do not remove */
movl %ebp, %ebp
movl %eax, %eax
movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 6*8), %r10d
- CFI_REGISTER rip,r10
movl $__USER32_DS,40(%rsp)
movq %rbp,32(%rsp)
movl $__USER32_CS,16(%rsp)
movq %r10,8(%rsp)
/* Construct struct pt_regs on stack */
- movq %rax,(%rsp) /* pt_regs->orig_ax */
- pushq_cfi_reg rdi /* pt_regs->di */
- pushq_cfi_reg rsi /* pt_regs->si */
- pushq_cfi_reg rdx /* pt_regs->dx */
- pushq_cfi_reg rcx /* pt_regs->cx */
- pushq_cfi_reg rax /* pt_regs->ax */
+ movq %rax,(%rsp) /* pt_regs->orig_ax */
+ pushq %rdi /* pt_regs->di */
+ pushq %rsi /* pt_regs->si */
+ pushq %rdx /* pt_regs->dx */
+ pushq %rcx /* pt_regs->cx */
+ pushq $-ENOSYS /* pt_regs->ax */
cld
- sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
- CFI_ADJUST_CFA_OFFSET 10*8
+ sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */
/*
* no need to do an access_ok check here because rbp has been
- * 32bit zero extended
+ * 32-bit zero extended
*/
ASM_STAC
-1: movl (%rbp),%ebp
- _ASM_EXTABLE(1b,ia32_badarg)
+1: movl (%rbp), %ebp
+ _ASM_EXTABLE(1b, ia32_badarg)
ASM_CLAC
orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz sysenter_tracesys
- jmp .Lia32_check_call
+ jnz sysenter_tracesys
+ jmp ia32_do_call
#ifdef CONFIG_AUDITSYSCALL
.macro auditsys_entry_common
- movl %esi,%r8d /* 5th arg: 4th syscall arg */
- movl %ecx,%r9d /*swap with edx*/
- movl %edx,%ecx /* 4th arg: 3rd syscall arg */
- movl %r9d,%edx /* 3rd arg: 2nd syscall arg */
- movl %ebx,%esi /* 2nd arg: 1st syscall arg */
- movl %eax,%edi /* 1st arg: syscall number */
- call __audit_syscall_entry
- movl RAX(%rsp),%eax /* reload syscall number */
- cmpq $(IA32_NR_syscalls-1),%rax
- ja ia32_badsys
- movl %ebx,%edi /* reload 1st syscall arg */
- movl RCX(%rsp),%esi /* reload 2nd syscall arg */
- movl RDX(%rsp),%edx /* reload 3rd syscall arg */
- movl RSI(%rsp),%ecx /* reload 4th syscall arg */
- movl RDI(%rsp),%r8d /* reload 5th syscall arg */
+ /*
+ * At this point, registers hold syscall args in the 32-bit syscall ABI:
+ * EAX is syscall number, the 6 args are in EBX,ECX,EDX,ESI,EDI,EBP.
+ *
+ * We want to pass them to __audit_syscall_entry(), which is a 64-bit
+ * C function with 5 parameters, so shuffle them to match what
+ * the function expects: RDI,RSI,RDX,RCX,R8.
+ */
+ movl %esi, %r8d /* arg5 (R8 ) <= 4th syscall arg (ESI) */
+ xchg %ecx, %edx /* arg4 (RCX) <= 3rd syscall arg (EDX) */
+ /* arg3 (RDX) <= 2nd syscall arg (ECX) */
+ movl %ebx, %esi /* arg2 (RSI) <= 1st syscall arg (EBX) */
+ movl %eax, %edi /* arg1 (RDI) <= syscall number (EAX) */
+ call __audit_syscall_entry
+
+ /*
+ * We are going to jump back to the syscall dispatch code.
+ * Prepare syscall args as required by the 64-bit C ABI.
+ * Registers clobbered by __audit_syscall_entry() are
+ * loaded from pt_regs on stack:
+ */
+ movl ORIG_RAX(%rsp), %eax /* syscall number */
+ movl %ebx, %edi /* arg1 */
+ movl RCX(%rsp), %esi /* arg2 */
+ movl RDX(%rsp), %edx /* arg3 */
+ movl RSI(%rsp), %ecx /* arg4 */
+ movl RDI(%rsp), %r8d /* arg5 */
.endm
sysenter_auditsys:
auditsys_entry_common
- movl %ebp,%r9d /* reload 6th syscall arg */
- jmp .Lia32_dispatch
+ movl %ebp, %r9d /* reload 6th syscall arg */
+ jmp .Lia32_dispatch
+#endif
+
+sysenter_tracesys:
+#ifdef CONFIG_AUDITSYSCALL
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ jz sysenter_auditsys
#endif
- CFI_ENDPROC
-ENDPROC(ia32_sysenter_target)
+ SAVE_EXTRA_REGS
+ xorl %eax, %eax /* Do not leak kernel information */
+ movq %rax, R11(%rsp)
+ movq %rax, R10(%rsp)
+ movq %rax, R9(%rsp)
+ movq %rax, R8(%rsp)
+ jmp .Ltracesys
+ENDPROC(entry_SYSENTER_compat)
/*
- * 32bit SYSCALL instruction entry.
+ * 32-bit SYSCALL instruction entry.
*
- * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
* then loads new ss, cs, and rip from previously programmed MSRs.
* rflags gets masked by a value from another MSR (so CLD and CLAC
* are not needed). SYSCALL does not save anything on the stack
* and does not change rsp.
*
* Note: rflags saving+masking-with-MSR happens only in Long mode
- * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it).
+ * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it).
* Don't get confused: rflags saving+masking depends on Long Mode Active bit
* (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
* or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
@@ -209,84 +169,86 @@ ENDPROC(ia32_sysenter_target)
* path below. We set up a complete hardware stack frame to share code
* with the int 0x80 path.
*/
-ENTRY(ia32_cstar_target)
- CFI_STARTPROC32 simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,SS+8-RIP+16
- /*CFI_REL_OFFSET ss,SS-RIP+16*/
- CFI_REL_OFFSET rsp,RSP-RIP+16
- /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/
- /*CFI_REL_OFFSET cs,CS-RIP+16*/
- CFI_REL_OFFSET rip,RIP-RIP+16
- movl RSP-RIP+16(%rsp),%r8d
+ENTRY(entry_SYSCALL_compat)
+ movl RSP-RIP+16(%rsp), %r8d
/* Zero-extending 32-bit regs, do not remove */
- movl %eax,%eax
+ movl %eax, %eax
/* Construct struct pt_regs on stack */
- movl $__USER32_DS,6*8(%rsp) /* pt_regs->ss */
- movl $__USER32_CS,3*8(%rsp) /* pt_regs->cs */
- movq_cfi rax,8 /* pt_regs->orig_ax */
- movq_cfi rdi,0 /* pt_regs->di */
- pushq_cfi_reg rsi /* pt_regs->si */
- pushq_cfi_reg rdx /* pt_regs->dx */
- pushq_cfi_reg rbp /* pt_regs->cx */
- movl %ebp,%ecx
- pushq_cfi %rax /* pt_regs->ax */
- sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
- CFI_ADJUST_CFA_OFFSET 10*8
+ movl $__USER32_DS,6*8(%rsp) /* pt_regs->ss */
+ movl $__USER32_CS,3*8(%rsp) /* pt_regs->cs */
+ movq %rax, 8(%rsp) /* pt_regs->orig_ax */
+ movq %rdi, 0(%rsp) /* pt_regs->di */
+ pushq %rsi /* pt_regs->si */
+ pushq %rdx /* pt_regs->dx */
+ pushq %rbp /* pt_regs->cx */
+ movl %ebp, %ecx
+ pushq $-ENOSYS /* pt_regs->ax */
+ sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */
/*
- * no need to do an access_ok check here because r8 has been
- * 32bit zero extended
+ * No need to do an access_ok check here because r8 has been
+ * 32-bit zero extended:
*/
ASM_STAC
-1: movl (%r8),%r9d
- _ASM_EXTABLE(1b,ia32_badarg)
+1: movl (%r8), %r9d
+ _ASM_EXTABLE(1b, ia32_badarg)
ASM_CLAC
- orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
- testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz cstar_tracesys
- cmpq $IA32_NR_syscalls-1,%rax
- ja ia32_badsys
+ orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+ testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ jnz cstar_tracesys
cstar_do_call:
- /* 32bit syscall -> 64bit C ABI argument conversion */
- movl %edi,%r8d /* arg5 */
- /* r9 already loaded */ /* arg6 */
+ /* 32-bit syscall -> 64-bit C ABI argument conversion */
+ movl %edi, %r8d /* arg5 */
+ /* r9 already loaded */ /* arg6 */
jmp .Lia32_arg_fixup_common
-
+
#ifdef CONFIG_AUDITSYSCALL
cstar_auditsys:
- movl %r9d,R9(%rsp) /* register to be clobbered by call */
+ movl %r9d, R9(%rsp) /* register to be clobbered by call */
auditsys_entry_common
- movl R9(%rsp),%r9d /* reload 6th syscall arg */
- jmp .Lia32_dispatch
+ movl R9(%rsp), %r9d /* reload 6th syscall arg */
+ jmp .Lia32_dispatch
#endif
cstar_tracesys:
#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jz cstar_auditsys
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ jz cstar_auditsys
#endif
- xchgl %r9d,%ebp
+ xchgl %r9d, %ebp
SAVE_EXTRA_REGS
- CLEAR_RREGS r9
- movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
- movq %rsp,%rdi /* &pt_regs -> arg1 */
- call syscall_trace_enter
- LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */
+ xorl %eax, %eax /* Do not leak kernel information */
+ movq %rax, R11(%rsp)
+ movq %rax, R10(%rsp)
+ movq %r9, R9(%rsp)
+ movq %rax, R8(%rsp)
+ movq %rsp, %rdi /* &pt_regs -> arg1 */
+ call syscall_trace_enter
+ movl R9(%rsp), %r9d
+
+ /* Reload arg registers from stack. (see sysenter_tracesys) */
+ movl RCX(%rsp), %ecx
+ movl RDX(%rsp), %edx
+ movl RSI(%rsp), %esi
+ movl RDI(%rsp), %edi
+ movl %eax, %eax /* zero extension */
+
RESTORE_EXTRA_REGS
- xchgl %ebp,%r9d
- cmpq $(IA32_NR_syscalls-1),%rax
- ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
- jmp cstar_do_call
-END(ia32_cstar_target)
-
+ xchgl %ebp, %r9d
+ jmp cstar_do_call
+END(entry_SYSCALL_compat)
+
ia32_badarg:
ASM_CLAC
- movq $-EFAULT,%rax
- jmp ia32_sysret
- CFI_ENDPROC
+ movq $-EFAULT, RAX(%rsp)
+ xorl %eax, %eax /* Do not leak kernel information */
+ movq %rax, R11(%rsp)
+ movq %rax, R10(%rsp)
+ movq %rax, R9(%rsp)
+ movq %rax, R8(%rsp)
+ jmp int_ret_from_sys_call
/*
* Emulated IA32 system calls via int 0x80.
@@ -309,121 +271,98 @@ ia32_badarg:
* Assumes it is only called from user space and entered with interrupts on.
*/
-ENTRY(ia32_syscall)
- CFI_STARTPROC32 simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,5*8+16
- /*CFI_REL_OFFSET ss,4*8+16 */
- CFI_REL_OFFSET rsp,3*8+16
- /*CFI_REL_OFFSET rflags,2*8+16 */
- /*CFI_REL_OFFSET cs,1*8+16 */
- CFI_REL_OFFSET rip,0*8+16
- CFI_REL_OFFSET r11,8
- CFI_REL_OFFSET rcx,0
- movq 8(%rsp),%r11
- CFI_RESTORE r11
- popq_cfi %rcx
- CFI_RESTORE rcx
+ENTRY(entry_INT80_compat)
+ movq 8(%rsp),%r11
+ popq %rcx
/* Zero-extending 32-bit regs, do not remove */
- movl %eax,%eax
+ movl %eax, %eax
/* Construct struct pt_regs on stack (iret frame is already on stack) */
- movq %rax,(%rsp) /* pt_regs->orig_ax */
- pushq_cfi_reg rdi /* pt_regs->di */
- pushq_cfi_reg rsi /* pt_regs->si */
- pushq_cfi_reg rdx /* pt_regs->dx */
- pushq_cfi_reg rcx /* pt_regs->cx */
- pushq_cfi_reg rax /* pt_regs->ax */
+ movq %rax,(%rsp) /* pt_regs->orig_ax */
+ pushq %rdi /* pt_regs->di */
+ pushq %rsi /* pt_regs->si */
+ pushq %rdx /* pt_regs->dx */
+ pushq %rcx /* pt_regs->cx */
+ pushq $-ENOSYS /* pt_regs->ax */
+ pushq $0 /* pt_regs->r8 */
+ pushq $0 /* pt_regs->r9 */
+ pushq $0 /* pt_regs->r10 */
+ pushq $0 /* pt_regs->r11 */
cld
- sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
- CFI_ADJUST_CFA_OFFSET 10*8
+ sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
+
+ orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+ testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ jnz ia32_tracesys
- orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
- testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz ia32_tracesys
-.Lia32_check_call:
- cmpq $(IA32_NR_syscalls-1),%rax
- ja ia32_badsys
ia32_do_call:
- /* 32bit syscall -> 64bit C ABI argument conversion */
- movl %edi,%r8d /* arg5 */
- movl %ebp,%r9d /* arg6 */
+ /* 32-bit syscall -> 64-bit C ABI argument conversion */
+ movl %edi, %r8d /* arg5 */
+ movl %ebp, %r9d /* arg6 */
.Lia32_arg_fixup_common:
- xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */
- movl %ebx,%edi /* arg1 */
- movl %edx,%edx /* arg3 (zero extension) */
+ xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */
+ movl %ebx, %edi /* arg1 */
+ movl %edx, %edx /* arg3 (zero extension) */
.Lia32_dispatch:
- call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
-ia32_sysret:
- movq %rax,RAX(%rsp)
- CLEAR_RREGS
- jmp int_ret_from_sys_call
+ cmpq $(IA32_NR_syscalls-1), %rax
+ ja 1f
+
+ call *ia32_sys_call_table(, %rax, 8)
+ movq %rax, RAX(%rsp)
+1:
+ jmp int_ret_from_sys_call
-sysenter_tracesys:
-#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jz sysenter_auditsys
-#endif
ia32_tracesys:
SAVE_EXTRA_REGS
- CLEAR_RREGS
- movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
- movq %rsp,%rdi /* &pt_regs -> arg1 */
- call syscall_trace_enter
- LOAD_ARGS32 /* reload args from stack in case ptrace changed it */
+.Ltracesys:
+ movq %rsp, %rdi /* &pt_regs -> arg1 */
+ call syscall_trace_enter
+ /*
+ * Reload arg registers from stack in case ptrace changed them.
+ * Don't reload %eax because syscall_trace_enter() returned
+ * the %rax value we should see. But do truncate it to 32 bits.
+ * If it's -1 to make us punt the syscall, then (u32)-1 is still
+ * an appropriately invalid value.
+ */
+ movl RCX(%rsp), %ecx
+ movl RDX(%rsp), %edx
+ movl RSI(%rsp), %esi
+ movl RDI(%rsp), %edi
+ movl %eax, %eax /* zero extension */
RESTORE_EXTRA_REGS
- cmpq $(IA32_NR_syscalls-1),%rax
- ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */
- jmp ia32_do_call
-END(ia32_syscall)
-
-ia32_badsys:
- movq $0,ORIG_RAX(%rsp)
- movq $-ENOSYS,%rax
- jmp ia32_sysret
+ jmp ia32_do_call
+END(entry_INT80_compat)
- CFI_ENDPROC
-
.macro PTREGSCALL label, func
ALIGN
GLOBAL(\label)
- leaq \func(%rip),%rax
- jmp ia32_ptregs_common
+ leaq \func(%rip), %rax
+ jmp ia32_ptregs_common
.endm
- CFI_STARTPROC32
-
- PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
- PTREGSCALL stub32_sigreturn, sys32_sigreturn
- PTREGSCALL stub32_fork, sys_fork
- PTREGSCALL stub32_vfork, sys_vfork
+ PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
+ PTREGSCALL stub32_sigreturn, sys32_sigreturn
+ PTREGSCALL stub32_fork, sys_fork
+ PTREGSCALL stub32_vfork, sys_vfork
ALIGN
GLOBAL(stub32_clone)
- leaq sys_clone(%rip),%rax
- mov %r8, %rcx
- jmp ia32_ptregs_common
+ leaq sys_clone(%rip), %rax
+ /*
+ * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr).
+ * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val).
+ *
+ * The native 64-bit kernel's sys_clone() implements the latter,
+ * so we need to swap arguments here before calling it:
+ */
+ xchg %r8, %rcx
+ jmp ia32_ptregs_common
ALIGN
ia32_ptregs_common:
- CFI_ENDPROC
- CFI_STARTPROC32 simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,SIZEOF_PTREGS
- CFI_REL_OFFSET rax,RAX
- CFI_REL_OFFSET rcx,RCX
- CFI_REL_OFFSET rdx,RDX
- CFI_REL_OFFSET rsi,RSI
- CFI_REL_OFFSET rdi,RDI
- CFI_REL_OFFSET rip,RIP
-/* CFI_REL_OFFSET cs,CS*/
-/* CFI_REL_OFFSET rflags,EFLAGS*/
- CFI_REL_OFFSET rsp,RSP
-/* CFI_REL_OFFSET ss,SS*/
SAVE_EXTRA_REGS 8
- call *%rax
+ call *%rax
RESTORE_EXTRA_REGS 8
ret
- CFI_ENDPROC
END(ia32_ptregs_common)
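
[Note: the compat entry points above all funnel into the same 32-bit to
64-bit argument conversion: the 32-bit syscall ABI passes arguments in
EBX, ECX, EDX, ESI, EDI, EBP, while the 64-bit C ABI expects them in
RDI, RSI, RDX, RCX, R8, R9. A hedged C restatement of the dispatch
(hypothetical helper, simplified table type; the unsigned int arguments
model the zero-extending movl instructions above):

typedef long (*sys_call_ptr_t)(unsigned long, unsigned long,
			       unsigned long, unsigned long,
			       unsigned long, unsigned long);

/* args32[] stands for EBX, ECX, EDX, ESI, EDI, EBP in that order;
 * the indexed call models "call *ia32_sys_call_table(, %rax, 8)". */
static long dispatch_compat(const sys_call_ptr_t *table, unsigned long nr,
			    const unsigned int args32[6])
{
	return table[nr](args32[0], args32[1], args32[2],
			 args32[3], args32[4], args32[5]);
}

The same mismatch is why stub32_clone swaps %rcx and %r8: the 32-bit
clone ABI orders the last two arguments as (tls_val, child_tidptr),
while the 64-bit sys_clone() expects (child_tidptr, tls_val).]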
--- a/arch/x86/entry/syscall_32-xen.c
+++ b/arch/x86/entry/syscall_32-xen.c
@@ -9,12 +9,12 @@ extern asmlinkage void cstar_set_tif(voi
#define ptregs_clone cstar_set_tif
#define ptregs_vfork cstar_set_tif
-__visible const sys_call_ptr_t cstar_call_table[__NR_ia32_syscall_max+1] = {
+__visible const sys_call_ptr_t cstar_call_table[__NR_syscall_compat_max+1] = {
/*
* Smells like a compiler bug -- it doesn't work
* when the & below is removed.
*/
- [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall,
+ [0 ... __NR_syscall_compat_max] = &sys_ni_syscall,
#include <asm/syscalls_32.h>
};
#endif /* TIF_CSTAR */
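
[Note: the cstar_call_table definition above leans on GCC's
range-designator extension: one initializer defaults every slot to
sys_ni_syscall, and the generated <asm/syscalls_32.h> entries then
override individual indices. A standalone sketch of the pattern, with
hypothetical entries and table size:

/* GNU C range designators; later designated initializers win. */
typedef long (*sys_call_ptr_t)(void);

static long sys_ni_syscall(void)   { return -38; /* -ENOSYS */ }
static long sys_hypothetical(void) { return 0; }

#define NR_MAX 511

static const sys_call_ptr_t table[NR_MAX + 1] = {
	[0 ... NR_MAX] = &sys_ni_syscall,  /* default every slot */
	[42]           = &sys_hypothetical /* overrides the default */
};
]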
--- a/arch/x86/entry/vdso/vdso32-setup-xen.c
+++ b/arch/x86/entry/vdso/vdso32-setup-xen.c
@@ -75,11 +75,11 @@ int __init sysenter_setup(void)
#ifdef CONFIG_X86_32
if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
# ifdef TIF_CSTAR
- extern asmlinkage void ia32pv_cstar_target(void);
+ extern asmlinkage void entry_SYSCALL_PV32(void);
static const struct callback_register __initconst cstar = {
.type = CALLBACKTYPE_syscall32,
.address = { __KERNEL_CS,
- (unsigned long)ia32pv_cstar_target },
+ (unsigned long)entry_SYSCALL_PV32 },
};
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD
&& HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) == 0)
--- a/arch/x86/include/asm/irqdomain.h
+++ b/arch/x86/include/asm/irqdomain.h
@@ -10,7 +10,11 @@ enum {
X86_IRQ_ALLOC_CONTIGUOUS_VECTORS = 0x1,
};
+#ifndef CONFIG_XEN
extern struct irq_domain *x86_vector_domain;
+#else
+#define x86_vector_domain xen_irq_domain
+#endif
extern void init_irq_alloc_info(struct irq_alloc_info *info,
const struct cpumask *mask);
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -191,7 +191,11 @@ static inline unsigned long current_stac
#else /* !__ASSEMBLY__ */
#ifdef CONFIG_X86_64
+#ifndef CONFIG_XEN
# define cpu_current_top_of_stack (cpu_tss + TSS_sp0)
+#else
+# define cpu_current_top_of_stack cpu_sp0
+#endif
#endif
/* Load thread_info address into "reg" */
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -25,14 +25,8 @@
#ifndef _ASM_X86_TOPOLOGY_H
#define _ASM_X86_TOPOLOGY_H
-#ifdef CONFIG_X86_32
-# ifdef CONFIG_SMP
-# define ENABLE_TOPO_DEFINES
-# endif
-#else
-# if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
-# define ENABLE_TOPO_DEFINES
-# endif
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
+# define ENABLE_TOPO_DEFINES
#endif
/*
--- a/arch/x86/include/mach-xen/asm/desc.h
+++ b/arch/x86/include/mach-xen/asm/desc.h
@@ -304,21 +304,6 @@ static inline void clear_LDT(void)
set_ldt(NULL, 0);
}
-/*
- * load one particular LDT into the current CPU
- */
-static inline void load_LDT_nolock(mm_context_t *pc)
-{
- set_ldt(pc->ldt, pc->size);
-}
-
-static inline void load_LDT(mm_context_t *pc)
-{
- preempt_disable();
- load_LDT_nolock(pc);
- preempt_enable();
-}
-
static inline unsigned long get_desc_base(const struct desc_struct *desc)
{
return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
--- a/arch/x86/include/mach-xen/asm/fpu/internal.h
+++ b/arch/x86/include/mach-xen/asm/fpu/internal.h
@@ -1,26 +1,33 @@
-#ifndef _FPU_INTERNAL_H
-#include <asm/i387.h>
+#ifndef _ASM_X86_FPU_INTERNAL_H
#define switch_fpu_prepare native_switch_fpu_prepare
-#include_next <asm/fpu-internal.h>
+#include_next <asm/fpu/internal.h>
#undef switch_fpu_prepare
-static inline bool xen_thread_fpu_begin(struct task_struct *tsk,
- multicall_entry_t *mcl)
+static inline void xen_fpregs_activate_hw(multicall_entry_t **mcl)
{
- bool ret = false;
+ if (!use_eager_fpu()) {
+ (*mcl)->op = __HYPERVISOR_fpu_taskswitch;
+ (*mcl)++->args[0] = 0;
+ }
+}
- if (mcl && !use_eager_fpu()) {
- mcl->op = __HYPERVISOR_fpu_taskswitch;
- mcl->args[0] = 0;
- ret = true;
+static inline void xen_fpregs_deactivate_hw(multicall_entry_t **mcl)
+{
+ if (!use_eager_fpu()) {
+ (*mcl)->op = __HYPERVISOR_fpu_taskswitch;
+ (*mcl)++->args[0] = 1;
}
- __thread_set_has_fpu(tsk);
+}
- return ret;
+static inline void xen_fpregs_activate(struct fpu *fpu,
+ multicall_entry_t **mcl)
+{
+ xen_fpregs_activate_hw(mcl);
+ __fpregs_activate(fpu);
}
-static inline fpu_switch_t xen_switch_fpu_prepare(struct task_struct *old,
- struct task_struct *new,
+static inline fpu_switch_t xen_switch_fpu_prepare(struct fpu *old_fpu,
+ struct fpu *new_fpu,
int cpu,
multicall_entry_t **mcl)
{
@@ -30,38 +37,36 @@ static inline fpu_switch_t xen_switch_fp
* If the task has used the math, pre-load the FPU on xsave processors
* or if the past 5 consecutive context-switches used math.
*/
- fpu.preload = tsk_used_math(new) &&
- (use_eager_fpu() || new->thread.fpu_counter > 5);
+ fpu.preload = new_fpu->fpstate_active &&
+ (use_eager_fpu() || new_fpu->counter > 5);
- if (__thread_has_fpu(old)) {
- if (!__save_init_fpu(old))
- task_disable_lazy_fpu_restore(old);
+ if (old_fpu->fpregs_active) {
+ if (!copy_fpregs_to_fpstate(old_fpu))
+ old_fpu->last_cpu = -1;
else
- old->thread.fpu.last_cpu = cpu;
+ old_fpu->last_cpu = cpu;
- /* But leave fpu_owner_task! */
- old->thread.fpu.has_fpu = 0;
+ /* But leave fpu_fpregs_owner_ctx! */
+ old_fpu->fpregs_active = 0;
/* Don't change CR0.TS if we just switch! */
if (fpu.preload) {
- new->thread.fpu_counter++;
- __thread_set_has_fpu(new);
- prefetch(new->thread.fpu.state);
- } else if (!use_eager_fpu()) {
- (*mcl)->op = __HYPERVISOR_fpu_taskswitch;
- (*mcl)++->args[0] = 1;
+ new_fpu->counter++;
+ __fpregs_activate(new_fpu);
+ prefetch(&new_fpu->state);
+ } else {
+ xen_fpregs_deactivate_hw(mcl);
}
} else {
- old->thread.fpu_counter = 0;
- task_disable_lazy_fpu_restore(old);
+ old_fpu->counter = 0;
+ old_fpu->last_cpu = -1;
if (fpu.preload) {
- new->thread.fpu_counter++;
- if (fpu_lazy_restore(new, cpu))
+ new_fpu->counter++;
+ if (fpu_want_lazy_restore(new_fpu, cpu))
fpu.preload = 0;
else
- prefetch(new->thread.fpu.state);
- if (xen_thread_fpu_begin(new, *mcl))
- ++*mcl;
+ prefetch(&new_fpu->state);
+ xen_fpregs_activate(new_fpu, mcl);
}
}
return fpu;
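
[Note: the rewritten helpers above no longer issue the fpu_taskswitch
hypercall directly; they append an entry to the caller's multicall list,
which the context-switch path submits later in a single batch. A hedged
sketch of the append idiom, with a simplified multicall_entry_t (the
hypercall number is the standard __HYPERVISOR_fpu_taskswitch = 5):

typedef struct {
	unsigned long op;
	unsigned long args[6];
} multicall_entry_t;

#define __HYPERVISOR_fpu_taskswitch 5

/* Append one CR0.TS toggle to the batch and advance the cursor;
 * set_ts = 1 sets TS (lazy FPU off), set_ts = 0 clears it. */
static void fpu_taskswitch_batch(multicall_entry_t **mcl, int set_ts)
{
	(*mcl)->op = __HYPERVISOR_fpu_taskswitch;
	(*mcl)++->args[0] = set_ts;
}
]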
--- a/arch/x86/include/mach-xen/asm/io.h
+++ b/arch/x86/include/mach-xen/asm/io.h
@@ -35,11 +35,13 @@
*/
#define ARCH_HAS_IOREMAP_WC
+#define ARCH_HAS_IOREMAP_WT
#include <linux/string.h>
#include <linux/compiler.h>
#include <asm/page.h>
#include <asm/early_ioremap.h>
+#include <asm/pgtable_types.h>
#ifdef __KERNEL__
#include <asm/fixmap.h>
#endif
@@ -183,6 +185,7 @@ static inline void *phys_to_virt(phys_ad
* look at pci_iomap().
*/
extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
unsigned long prot_val);
@@ -203,8 +206,6 @@ extern void set_iounmap_nonlazy(void);
#include <asm-generic/iomap.h>
-#include <linux/vmalloc.h>
-
/*
* Convert a virtual cached pointer to an uncached pointer
*/
@@ -243,6 +244,12 @@ static inline void flush_write_buffers(v
#endif
}
+static inline void __pmem *arch_memremap_pmem(resource_size_t offset,
+ unsigned long size)
+{
+ return (void __force __pmem *) ioremap_cache(offset, size);
+}
+
#endif /* __KERNEL__ */
extern void native_io_delay(void);
@@ -325,12 +332,16 @@ extern void unxlate_dev_mem_ptr(phys_add
extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
enum page_cache_mode pcm);
extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size);
extern bool is_early_ioremap_ptep(pte_t *ptep);
#define IO_SPACE_LIMIT 0xffff
#ifdef CONFIG_MTRR
+extern int __must_check arch_phys_wc_index(int handle);
+#define arch_phys_wc_index arch_phys_wc_index
+
extern int __must_check arch_phys_wc_add(unsigned long base,
unsigned long size);
extern void arch_phys_wc_del(int handle);
--- a/arch/x86/include/mach-xen/asm/irq_vectors.h
+++ b/arch/x86/include/mach-xen/asm/irq_vectors.h
@@ -4,9 +4,6 @@
#define MCE_VECTOR 0x12
#define IA32_SYSCALL_VECTOR 0x80
-#ifdef CONFIG_X86_32
-# define SYSCALL_VECTOR 0x80
-#endif
#define RESCHEDULE_VECTOR 0
#define CALL_FUNCTION_VECTOR 1
@@ -48,7 +45,7 @@ static inline int invalid_vm86_irq(int i
* static arrays.
*/
-#define NR_IRQS_LEGACY 16
+#define NR_IRQS_LEGACY 16
/*
* The flat IRQ space is divided into two regions:
@@ -67,15 +64,17 @@ static inline int invalid_vm86_irq(int i
#define IO_APIC_VECTOR_LIMIT PIRQ_MAX(32 * MAX_IO_APICS)
#define CPU_VECTOR_LIMIT PIRQ_MAX(64 * NR_CPUS)
-#if defined(CONFIG_X86_IO_APIC)
-# define NR_PIRQS \
+#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_PCI_MSI)
+#define NR_PIRQS \
(CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \
(NR_VECTORS + CPU_VECTOR_LIMIT) : \
(NR_VECTORS + IO_APIC_VECTOR_LIMIT))
-#elif defined(CONFIG_XEN_PCIDEV_FRONTEND)
-# define NR_PIRQS (NR_VECTORS + CPU_VECTOR_LIMIT)
-#else /* !CONFIG_X86_IO_APIC: */
-# define NR_PIRQS NR_IRQS_LEGACY
+#elif defined(CONFIG_X86_IO_APIC)
+#define NR_PIRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT)
+#elif defined(CONFIG_PCI_MSI)
+#define NR_PIRQS (NR_VECTORS + CPU_VECTOR_LIMIT)
+#else
+#define NR_PIRQS NR_IRQS_LEGACY
#endif
#ifndef __ASSEMBLY__
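
To make the NR_PIRQS selection above concrete, take NR_VECTORS = 256,
MAX_IO_APICS = 128 and NR_CPUS = 512 as sample values and, purely for the
arithmetic, treat the Xen-specific PIRQ_MAX() cap as a pass-through:

	IO-APIC + MSI:	256 + max(64 * 512, 32 * 128) = 256 + 32768 = 33024
	IO-APIC only:	256 + 32 * 128 = 4352
	MSI only:	256 + 64 * 512 = 33024
	neither:	NR_IRQS_LEGACY = 16

i.e. the rework stops reserving the much larger per-CPU vector space when
only the IO-APIC, and not MSI, can consume pirqs.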
--- a/arch/x86/include/mach-xen/asm/mmu_context.h
+++ b/arch/x86/include/mach-xen/asm/mmu_context.h
@@ -30,7 +30,7 @@ extern struct static_key rdpmc_always_av
static inline void load_mm_cr4(struct mm_struct *mm)
{
- if (static_key_true(&rdpmc_always_available) ||
+ if (static_key_false(&rdpmc_always_available) ||
atomic_read(&mm->context.perf_rdpmc_allowed))
cr4_set_bits(X86_CR4_PCE);
else
@@ -41,6 +41,50 @@ static inline void load_mm_cr4(struct mm
#endif
/*
+ * ldt_structs can be allocated, used, and freed, but they are never
+ * modified while live.
+ */
+struct ldt_struct {
+ /*
+ * Xen requires page-aligned LDTs with special permissions. This is
+ * needed to prevent us from installing evil descriptors such as
+ * call gates. On native, we could merge the ldt_struct and LDT
+ * allocations, but it's not worth trying to optimize.
+ */
+ struct desc_struct *entries;
+ int size;
+};
+
+static inline void load_mm_ldt(struct mm_struct *mm)
+{
+ struct ldt_struct *ldt;
+
+ /* lockless_dereference synchronizes with smp_store_release */
+ ldt = lockless_dereference(mm->context.ldt);
+
+ /*
+ * Any change to mm->context.ldt is followed by an IPI to all
+ * CPUs with the mm active. The LDT will not be freed until
+ * after the IPI is handled by all such CPUs. This means that,
+ * if the ldt_struct changes before we return, the values we see
+ * will be safe, and the new values will be loaded before we run
+ * any user code.
+ *
+ * NB: don't try to convert this to use RCU without extreme care.
+ * We would still need IRQs off, because we don't want to change
+ * the local LDT after an IPI loaded a newer value than the one
+ * that we can see.
+ */
+
+ if (unlikely(ldt))
+ set_ldt(ldt->entries, ldt->size);
+ else
+ clear_LDT();
+
+ DEBUG_LOCKS_WARN_ON(preemptible());
+}
+
+/*
* Used for LDT copy/destruction.
*/
int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
@@ -137,15 +181,24 @@ static inline void switch_mm(struct mm_s
* was called and then modify_ldt changed
* prev->context.ldt but suppressed an IPI to this CPU.
* In this case, prev->context.ldt != NULL, because we
- * never free an LDT while the mm still exists. That
- * means that next->context.ldt != prev->context.ldt,
- * because mms never share an LDT.
+ * never set context.ldt to NULL while the mm still
+ * exists. That means that next->context.ldt !=
+ * prev->context.ldt, because mms never share an LDT.
*/
if (unlikely(prev->context.ldt != next->context.ldt)) {
- /* load_LDT_nolock(&next->context) */
+ /* load_mm_ldt(next) */
+ const struct ldt_struct *ldt;
+
+ /* lockless_dereference synchronizes with smp_store_release */
+ ldt = lockless_dereference(next->context.ldt);
op->cmd = MMUEXT_SET_LDT;
- op->arg1.linear_addr = (unsigned long)next->context.ldt;
- op->arg2.nr_ents = next->context.size;
+ if (unlikely(ldt)) {
+ op->arg1.linear_addr = (long)ldt->entries;
+ op->arg2.nr_ents = ldt->size;
+ } else {
+ op->arg1.linear_addr = 0;
+ op->arg2.nr_ents = 0;
+ }
op++;
}
@@ -176,7 +229,7 @@ static inline void switch_mm(struct mm_s
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
load_mm_cr4(next);
xen_new_user_pt(__pa(__user_pgd(next->pgd)));
- load_LDT_nolock(&next->context);
+ load_mm_ldt(next);
}
}
#endif
@@ -214,6 +267,19 @@ static inline void arch_exit_mmap(struct
}
#endif
+#ifdef CONFIG_X86_64
+static inline bool is_64bit_mm(struct mm_struct *mm)
+{
+ return !config_enabled(CONFIG_IA32_EMULATION) ||
+ !(mm->context.ia32_compat == TIF_IA32);
+}
+#else
+static inline bool is_64bit_mm(struct mm_struct *mm)
+{
+ return false;
+}
+#endif
+
static inline void arch_bprm_mm_init(struct mm_struct *mm,
struct vm_area_struct *vma)
{
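
The lockless_dereference() reads in load_mm_ldt() and switch_mm() above pair
with a release store on the update side. Roughly, modelled on the upstream
x86/ldt rework (the Xen tree's actual writer additionally has to adjust the
LDT pages' permissions for the hypervisor):

	static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt)
	{
		/* Synchronizes with lockless_dereference() in load_mm_ldt(). */
		smp_store_release(&mm->context.ldt, ldt);

		/*
		 * Reload the LDT on every CPU currently running this mm;
		 * the old ldt_struct may only be freed once these IPIs
		 * have completed.
		 */
		on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true);
	}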
--- a/arch/x86/include/mach-xen/asm/pci.h
+++ b/arch/x86/include/mach-xen/asm/pci.h
@@ -5,7 +5,7 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/string.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
#include <asm/io.h>
#include <asm/x86_init.h>
@@ -86,13 +86,6 @@ extern int pci_mmap_page_range(struct pc
#ifdef CONFIG_PCI
extern void early_quirks(void);
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
- enum pci_dma_burst_strategy *strat,
- unsigned long *strategy_parameter)
-{
- *strat = PCI_DMA_BURST_INFINITY;
- *strategy_parameter = ~0UL;
-}
#else
static inline void early_quirks(void) { }
#endif
@@ -102,15 +95,10 @@ extern void pci_iommu_alloc(void);
#if defined(CONFIG_PCI_MSI) && !defined(CONFIG_XEN)
/* implemented in arch/x86/kernel/apic/io_apic. */
struct msi_desc;
-void native_compose_msi_msg(struct pci_dev *pdev, unsigned int irq,
- unsigned int dest, struct msi_msg *msg, u8 hpet_id);
int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
void native_teardown_msi_irq(unsigned int irq);
void native_restore_msi_irqs(struct pci_dev *dev);
-int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
- unsigned int irq_base, unsigned int irq_offset);
#else
-#define native_compose_msi_msg NULL
#define native_setup_msi_irqs NULL
#define native_teardown_msi_irq NULL
#endif
--- a/arch/x86/include/mach-xen/asm/pgtable.h
+++ b/arch/x86/include/mach-xen/asm/pgtable.h
@@ -398,11 +398,17 @@ static inline int is_new_memtype_allowed
* requested memtype:
* - request is uncached, return cannot be write-back
* - request is write-combine, return cannot be write-back
+ * - request is write-through, return cannot be write-back
+ * - request is write-through, return cannot be write-combine
*/
if ((pcm == _PAGE_CACHE_MODE_UC_MINUS &&
new_pcm == _PAGE_CACHE_MODE_WB) ||
(pcm == _PAGE_CACHE_MODE_WC &&
- new_pcm == _PAGE_CACHE_MODE_WB)) {
+ new_pcm == _PAGE_CACHE_MODE_WB) ||
+ (pcm == _PAGE_CACHE_MODE_WT &&
+ new_pcm == _PAGE_CACHE_MODE_WB) ||
+ (pcm == _PAGE_CACHE_MODE_WT &&
+ new_pcm == _PAGE_CACHE_MODE_WC)) {
return 0;
}
@@ -840,9 +846,9 @@ static inline int pmd_write(pmd_t pmd)
return pmd_flags(pmd) & _PAGE_RW;
}
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp)
{
pmd_t pmd = xen_pmdp_get_and_clear(pmdp);
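
As a quick sanity check of the two write-through rules added to
is_new_memtype_allowed() above (assuming the usual (paddr, size, pcm,
new_pcm) signature, with a nonzero return meaning "allowed"):

	/* A WT request must not be satisfied by a WB or WC mapping ... */
	WARN_ON(is_new_memtype_allowed(paddr, size, _PAGE_CACHE_MODE_WT,
				       _PAGE_CACHE_MODE_WB));
	WARN_ON(is_new_memtype_allowed(paddr, size, _PAGE_CACHE_MODE_WT,
				       _PAGE_CACHE_MODE_WC));
	/* ... while weakening WT to UC- is still acceptable. */
	WARN_ON(!is_new_memtype_allowed(paddr, size, _PAGE_CACHE_MODE_WT,
					_PAGE_CACHE_MODE_UC_MINUS));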
--- a/arch/x86/include/mach-xen/asm/pgtable_types.h
+++ b/arch/x86/include/mach-xen/asm/pgtable_types.h
@@ -419,6 +419,9 @@ extern int nx_enabled;
#define pgprot_writecombine pgprot_writecombine
extern pgprot_t pgprot_writecombine(pgprot_t prot);
+#define pgprot_writethrough pgprot_writethrough
+extern pgprot_t pgprot_writethrough(pgprot_t prot);
+
#ifndef CONFIG_XEN
/* Indicate that x86 has its own track and untrack pfn vma functions */
#define __HAVE_PFNMAP_TRACKING
--- a/arch/x86/include/mach-xen/asm/processor.h
+++ b/arch/x86/include/mach-xen/asm/processor.h
@@ -21,6 +21,7 @@ struct mm_struct;
#include <asm/desc_defs.h>
#include <asm/nops.h>
#include <asm/special_insns.h>
+#include <asm/fpu/types.h>
#include <linux/personality.h>
#include <linux/cpumask.h>
@@ -54,11 +55,16 @@ static inline void *current_text_addr(vo
return pc;
}
+/*
+ * These alignment constraints are for performance in the vSMP case,
+ * but in the task_struct case we must also meet hardware imposed
+ * alignment requirements of the FPU state:
+ */
#ifdef CONFIG_X86_VSMP
# define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
# define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
#else
-# define ARCH_MIN_TASKALIGN 16
+# define ARCH_MIN_TASKALIGN __alignof__(union fpregs_state)
# define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif
@@ -177,7 +183,6 @@ extern const struct seq_operations cpuin
#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
extern void cpu_detect(struct cpuinfo_x86 *c);
-extern void fpu_detect(struct cpuinfo_x86 *c);
extern void early_cpu_init(void);
extern void identify_boot_cpu(void);
@@ -328,131 +333,9 @@ struct orig_ist {
#ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
#else
-DECLARE_PER_CPU(unsigned long, kernel_stack);
+DECLARE_PER_CPU(unsigned long, cpu_sp0);
#endif
-#define MXCSR_DEFAULT 0x1f80
-
-struct i387_fsave_struct {
- u32 cwd; /* FPU Control Word */
- u32 swd; /* FPU Status Word */
- u32 twd; /* FPU Tag Word */
- u32 fip; /* FPU IP Offset */
- u32 fcs; /* FPU IP Selector */
- u32 foo; /* FPU Operand Pointer Offset */
- u32 fos; /* FPU Operand Pointer Selector */
-
- /* 8*10 bytes for each FP-reg = 80 bytes: */
- u32 st_space[20];
-
- /* Software status information [not touched by FSAVE ]: */
- u32 status;
-};
-
-struct i387_fxsave_struct {
- u16 cwd; /* Control Word */
- u16 swd; /* Status Word */
- u16 twd; /* Tag Word */
- u16 fop; /* Last Instruction Opcode */
- union {
- struct {
- u64 rip; /* Instruction Pointer */
- u64 rdp; /* Data Pointer */
- };
- struct {
- u32 fip; /* FPU IP Offset */
- u32 fcs; /* FPU IP Selector */
- u32 foo; /* FPU Operand Offset */
- u32 fos; /* FPU Operand Selector */
- };
- };
- u32 mxcsr; /* MXCSR Register State */
- u32 mxcsr_mask; /* MXCSR Mask */
-
- /* 8*16 bytes for each FP-reg = 128 bytes: */
- u32 st_space[32];
-
- /* 16*16 bytes for each XMM-reg = 256 bytes: */
- u32 xmm_space[64];
-
- u32 padding[12];
-
- union {
- u32 padding1[12];
- u32 sw_reserved[12];
- };
-
-} __attribute__((aligned(16)));
-
-struct i387_soft_struct {
- u32 cwd;
- u32 swd;
- u32 twd;
- u32 fip;
- u32 fcs;
- u32 foo;
- u32 fos;
- /* 8*10 bytes for each FP-reg = 80 bytes: */
- u32 st_space[20];
- u8 ftop;
- u8 changed;
- u8 lookahead;
- u8 no_update;
- u8 rm;
- u8 alimit;
- struct math_emu_info *info;
- u32 entry_eip;
-};
-
-struct ymmh_struct {
- /* 16 * 16 bytes for each YMMH-reg = 256 bytes */
- u32 ymmh_space[64];
-};
-
-/* We don't support LWP yet: */
-struct lwp_struct {
- u8 reserved[128];
-};
-
-struct bndreg {
- u64 lower_bound;
- u64 upper_bound;
-} __packed;
-
-struct bndcsr {
- u64 bndcfgu;
- u64 bndstatus;
-} __packed;
-
-struct xsave_hdr_struct {
- u64 xstate_bv;
- u64 xcomp_bv;
- u64 reserved[6];
-} __attribute__((packed));
-
-struct xsave_struct {
- struct i387_fxsave_struct i387;
- struct xsave_hdr_struct xsave_hdr;
- struct ymmh_struct ymmh;
- struct lwp_struct lwp;
- struct bndreg bndreg[4];
- struct bndcsr bndcsr;
- /* new processor state extensions will go here */
-} __attribute__ ((packed, aligned (64)));
-
-union thread_xstate {
- struct i387_fsave_struct fsave;
- struct i387_fxsave_struct fxsave;
- struct i387_soft_struct soft;
- struct xsave_struct xsave;
-};
-
-struct fpu {
- unsigned int last_cpu;
- unsigned int has_fpu;
- union thread_xstate *state;
-};
-
#ifdef CONFIG_X86_64
#ifndef CONFIG_X86_NO_TSS
DECLARE_PER_CPU(struct orig_ist, orig_ist);
@@ -503,8 +386,6 @@ DECLARE_PER_CPU(struct irq_stack *, soft
#endif /* X86_64 */
extern unsigned int xstate_size;
-extern void free_thread_xstate(struct task_struct *);
-extern struct kmem_cache *task_xstate_cachep;
struct perf_event;
@@ -528,6 +409,7 @@ struct thread_struct {
unsigned long fs;
#endif
unsigned long gs;
+
/* Save middle states of ptrace breakpoints */
struct perf_event *ptrace_bps[HBP_NUM];
/* Debug status used for traps, single steps, etc... */
@@ -538,8 +420,6 @@ struct thread_struct {
unsigned long cr2;
unsigned long trap_nr;
unsigned long error_code;
- /* floating point and extended processor state */
- struct fpu fpu;
#ifdef CONFIG_X86_32
/* Virtual 86 mode info */
struct vm86_struct __user *vm86_info;
@@ -555,15 +435,13 @@ struct thread_struct {
unsigned long iopl;
/* Max allowed port in the bitmap, in bytes: */
unsigned io_bitmap_max;
+
+ /* Floating point and extended processor state */
+ struct fpu fpu;
/*
- * fpu_counter contains the number of consecutive context switches
- * that the FPU is used. If this is over a threshold, the lazy fpu
- * saving becomes unlazy to save the trap. This is an unsigned char
- * so that after 256 times the counter wraps and the behavior turns
- * lazy again; this to deal with bursty apps that only use FPU for
- * a short time
+ * WARNING: 'fpu' is dynamically-sized. It *MUST* be at
+ * the end.
*/
- unsigned char fpu_counter;
};
/*
@@ -601,7 +479,7 @@ native_load_sp0(struct tss_struct *tss,
static inline unsigned long current_top_of_stack(void)
{
#ifdef CONFIG_X86_64
- return this_cpu_read_stable(kernel_stack);
+ return this_cpu_read_stable(cpu_sp0);
#else
/* sp0 on x86_32 is special in and around vm86 mode. */
return this_cpu_read_stable(cpu_current_top_of_stack);
@@ -931,24 +809,25 @@ extern int get_tsc_mode(unsigned long ad
extern int set_tsc_mode(unsigned int val);
/* Register/unregister a process' MPX related resource */
-#define MPX_ENABLE_MANAGEMENT(tsk) mpx_enable_management((tsk))
-#define MPX_DISABLE_MANAGEMENT(tsk) mpx_disable_management((tsk))
+#define MPX_ENABLE_MANAGEMENT() mpx_enable_management()
+#define MPX_DISABLE_MANAGEMENT() mpx_disable_management()
#ifdef CONFIG_X86_INTEL_MPX
-extern int mpx_enable_management(struct task_struct *tsk);
-extern int mpx_disable_management(struct task_struct *tsk);
+extern int mpx_enable_management(void);
+extern int mpx_disable_management(void);
#else
-static inline int mpx_enable_management(struct task_struct *tsk)
+static inline int mpx_enable_management(void)
{
return -EINVAL;
}
-static inline int mpx_disable_management(struct task_struct *tsk)
+static inline int mpx_disable_management(void)
{
return -EINVAL;
}
#endif /* CONFIG_X86_INTEL_MPX */
extern u16 amd_get_nb_id(int cpu);
+extern u32 amd_get_nodes_per_socket(void);
static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
{
--- a/arch/x86/include/mach-xen/asm/smp.h
+++ b/arch/x86/include/mach-xen/asm/smp.h
@@ -36,19 +36,7 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
-#endif
-static inline const struct cpumask *cpu_sibling_mask(int cpu)
-{
- return cpumask_of(cpu);
-}
-
-static inline const struct cpumask *cpu_core_mask(int cpu)
-{
- return cpumask_of(cpu);
-}
-
-#ifndef CONFIG_XEN
static inline struct cpumask *cpu_llc_shared_mask(int cpu)
{
return per_cpu(cpu_llc_shared_map, cpu);
--- a/arch/x86/include/mach-xen/asm/special_insns.h
+++ b/arch/x86/include/mach-xen/asm/special_insns.h
@@ -262,6 +262,44 @@ static inline void clwb(volatile void *_
: [pax] "a" (p));
}
+/**
+ * pcommit_sfence() - persistent commit and fence
+ *
+ * The PCOMMIT instruction ensures that data that has been flushed from the
+ * processor's cache hierarchy with CLWB, CLFLUSHOPT or CLFLUSH is accepted to
+ * memory and is durable on the DIMM. The primary use case for this is
+ * persistent memory.
+ *
+ * This function shows how to properly use CLWB/CLFLUSHOPT/CLFLUSH and PCOMMIT
+ * with appropriate fencing.
+ *
+ * Example:
+ * void flush_and_commit_buffer(void *vaddr, unsigned int size)
+ * {
+ * unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1;
+ * void *vend = vaddr + size;
+ * void *p;
+ *
+ * for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
+ * p < vend; p += boot_cpu_data.x86_clflush_size)
+ * clwb(p);
+ *
+ * // SFENCE to order CLWB/CLFLUSHOPT/CLFLUSH cache flushes
+ * // MFENCE via mb() also works
+ * wmb();
+ *
+ * // PCOMMIT and the required SFENCE for ordering
+ * pcommit_sfence();
+ * }
+ *
+ * After this function completes the data pointed to by 'vaddr' has been
+ * accepted to memory and will be durable if the 'vaddr' points to persistent
+ * memory.
+ *
+ * PCOMMIT must always be ordered by an MFENCE or SFENCE, so to help simplify
+ * things we include both the PCOMMIT and the required SFENCE in the
+ * alternatives generated by pcommit_sfence().
+ */
static inline void pcommit_sfence(void)
{
alternative(ASM_NOP7,
--- a/arch/x86/include/mach-xen/asm/spinlock.h
+++ b/arch/x86/include/mach-xen/asm/spinlock.h
@@ -37,6 +37,10 @@
# define UNLOCK_LOCK_PREFIX
#endif
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm/qspinlock.h>
+#else
+
#ifdef TICKET_SHIFT
/* How long a lock should spin before we consider blocking */
@@ -44,8 +48,6 @@
#include <asm/irqflags.h>
-int xen_spinlock_init(unsigned int cpu);
-void xen_spinlock_cleanup(unsigned int cpu);
#if CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING
struct __raw_tickets xen_spin_adjust(const arch_spinlock_t *,
struct __raw_tickets);
@@ -191,9 +193,6 @@ static inline void __ticket_spin_unlock_
#else /* TICKET_SHIFT */
-static inline int xen_spinlock_init(unsigned int cpu) { return 0; }
-static inline void xen_spinlock_cleanup(unsigned int cpu) {}
-
static __always_inline int __byte_spin_value_unlocked(arch_spinlock_t lock)
{
return lock.lock == 0;
@@ -297,6 +296,15 @@ static __always_inline void arch_spin_un
}
#undef __arch_spin
+#endif /* CONFIG_QUEUED_SPINLOCKS */
+
+#if !defined(CONFIG_QUEUED_SPINLOCKS) && defined(TICKET_SHIFT)
+int xen_spinlock_init(unsigned int cpu);
+void xen_spinlock_cleanup(unsigned int cpu);
+#else
+static inline int xen_spinlock_init(unsigned int cpu) { return 0; }
+static inline void xen_spinlock_cleanup(unsigned int cpu) {}
+#endif
/*
* Read-write spinlocks, allowing multiple readers
--- a/arch/x86/include/mach-xen/asm/spinlock_types.h
+++ b/arch/x86/include/mach-xen/asm/spinlock_types.h
@@ -3,6 +3,9 @@
#include <linux/types.h>
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm-generic/qspinlock_types.h>
+#else
#ifdef CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING
#define __TICKET_LOCK_INC 1
#define TICKET_SLOWPATH_FLAG ((__ticket_t)0)
@@ -49,6 +52,7 @@ typedef struct {
} arch_spinlock_t;
#define __ARCH_SPIN_LOCK_UNLOCKED { 0 }
+#endif /* CONFIG_QUEUED_SPINLOCKS */
#include <asm-generic/qrwlock_types.h>
--- a/arch/x86/kernel/apic/io_apic-xen.c
+++ b/arch/x86/kernel/apic/io_apic-xen.c
@@ -18,6 +18,16 @@
* and Rolf G. Tews
* for testing these extensively
* Paul Diefenbaugh : Added full ACPI support
+ *
+ * Historical information worth preserving:
+ *
+ * - SiS APIC rmw bug:
+ *
+ * We used to have a workaround for a bug in SiS chips which
+ * required rewriting the index register for a read-modify-write
+ * operation, as the chip lost the index information that had been
+ * set up for the read. We cache the data now, so that workaround
+ * has been removed.
*/
#include <linux/mm.h>
@@ -31,13 +41,13 @@
#include <linux/acpi.h>
#include <linux/module.h>
#include <linux/syscore_ops.h>
-#include <linux/irqdomain.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/jiffies.h> /* time_after() */
#include <linux/slab.h>
#include <linux/bootmem.h>
+#include <asm/irqdomain.h>
#include <asm/idle.h>
#include <asm/io.h>
#include <asm/smp.h>
@@ -84,27 +94,33 @@ unsigned long io_apic_irqs;
#define for_each_ioapic_pin(idx, pin) \
for_each_ioapic((idx)) \
for_each_pin((idx), (pin))
-
#define for_each_irq_pin(entry, head) \
list_for_each_entry(entry, &head, list)
-/*
- * Is the SiS APIC rmw bug present ?
- * -1 = don't know, 0 = no, 1 = yes
- */
-int sis_apic_bug = -1;
-
static DEFINE_RAW_SPINLOCK(ioapic_lock);
static DEFINE_MUTEX(ioapic_mutex);
static unsigned int ioapic_dynirq_base;
static int ioapic_initialized;
-struct mp_pin_info {
+struct irq_pin_list {
+ struct list_head list;
+ int apic, pin;
+};
+
+struct mp_chip_data {
+#ifndef CONFIG_XEN
+ struct list_head irq_2_pin;
+#endif
+ struct IO_APIC_route_entry entry;
int trigger;
int polarity;
- int node;
- int set;
u32 count;
+ bool isa_irq;
+};
+
+struct mp_ioapic_gsi {
+ u32 gsi_base;
+ u32 gsi_end;
};
static struct ioapic {
@@ -124,7 +140,6 @@ static struct ioapic {
struct mp_ioapic_gsi gsi_config;
struct ioapic_domain_cfg irqdomain_cfg;
struct irq_domain *irqdomain;
- struct mp_pin_info *pin_info;
struct resource *iomem_res;
} ioapics[MAX_IO_APICS];
@@ -140,7 +155,7 @@ unsigned int mpc_ioapic_addr(int ioapic_
return ioapics[ioapic_idx].mp_config.apicaddr;
}
-struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx)
+static inline struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx)
{
return &ioapics[ioapic_idx].gsi_config;
}
@@ -152,11 +167,16 @@ static inline int mp_ioapic_pin_count(in
return gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1;
}
-u32 mp_pin_to_gsi(int ioapic, int pin)
+static inline u32 mp_pin_to_gsi(int ioapic, int pin)
{
return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin;
}
+static inline bool mp_is_legacy_irq(int irq)
+{
+ return irq >= 0 && irq < nr_legacy_irqs();
+}
+
/*
* Initialize all legacy IRQs and all pins on the first IOAPIC
* if we have legacy interrupt controller. Kernel boot option "pirq="
@@ -167,12 +187,7 @@ static inline int mp_init_irq_at_boot(in
if (!nr_legacy_irqs())
return 0;
- return ioapic == 0 || (irq >= 0 && irq < nr_legacy_irqs());
-}
-
-static inline struct mp_pin_info *mp_pin_info(int ioapic_idx, int pin)
-{
- return ioapics[ioapic_idx].pin_info + pin;
+ return ioapic == 0 || mp_is_legacy_irq(irq);
}
static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic)
@@ -240,16 +255,6 @@ void mp_save_irq(struct mpc_intsrc *m)
}
#ifndef CONFIG_XEN
-struct irq_pin_list {
- struct list_head list;
- int apic, pin;
-};
-
-static struct irq_pin_list *alloc_irq_pin_list(int node)
-{
- return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node);
-}
-
static void alloc_ioapic_saved_registers(int idx)
{
size_t size;
@@ -271,8 +276,7 @@ static void free_ioapic_saved_registers(
int __init arch_early_ioapic_init(void)
{
- struct irq_cfg *cfg;
- int i, node = cpu_to_node(0);
+ int i;
if (!nr_legacy_irqs())
io_apic_irqs = ~0UL;
@@ -280,16 +284,6 @@ int __init arch_early_ioapic_init(void)
for_each_ioapic(i)
alloc_ioapic_saved_registers(i);
- /*
- * For legacy IRQ's, start with assigning irq0 to irq15 to
- * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's.
- */
- for (i = 0; i < nr_legacy_irqs(); i++) {
- cfg = alloc_irq_and_cfg_at(i, node);
- cfg->vector = IRQ0_VECTOR + i;
- cpumask_setall(cfg->domain);
- }
-
return 0;
}
@@ -307,7 +301,7 @@ static __attribute_const__ struct io_api
+ (mpc_ioapic_addr(idx) & ~PAGE_MASK);
}
-void io_apic_eoi(unsigned int apic, unsigned int vector)
+static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
writel(vector, &io_apic->eoi);
@@ -320,28 +314,14 @@ unsigned int native_io_apic_read(unsigne
return readl(&io_apic->data);
}
-void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+static void io_apic_write(unsigned int apic, unsigned int reg,
+ unsigned int value)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
writel(reg, &io_apic->index);
writel(value, &io_apic->data);
}
-
-/*
- * Re-write a value: to be used for read-modify-write
- * cycles where the read already set up the index register.
- *
- * Older SiS APIC requires we rewrite the index register
- */
-void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
-{
- struct io_apic __iomem *io_apic = io_apic_base(apic);
-
- if (sis_apic_bug)
- writel(reg, &io_apic->index);
- writel(value, &io_apic->data);
-}
#else /* !CONFIG_XEN */
#define alloc_ioapic_saved_registers(idx)
#define free_ioapic_saved_registers(idx)
@@ -368,8 +348,6 @@ static inline void io_apic_write(unsigne
apic_op.value = value;
WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
}
-
-#define io_apic_modify io_apic_write
#endif /* !CONFIG_XEN */
union entry_union {
@@ -429,7 +407,6 @@ static void ioapic_write_entry(int apic,
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
-#ifndef CONFIG_XEN
/*
* When we mask an IO APIC routing entry, we need to write the low
* word first, in order to set the mask bit before we change the
@@ -438,29 +415,33 @@ static void ioapic_write_entry(int apic,
static void ioapic_mask_entry(int apic, int pin)
{
unsigned long flags;
- union entry_union eu = { .entry.mask = 1 };
+ union entry_union eu = { .entry.mask = IOAPIC_MASKED };
raw_spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+#ifndef CONFIG_XEN
io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+#endif
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
+#ifndef CONFIG_XEN
/*
* The common case is 1:1 IRQ<->pin mappings. Sometimes there are
* shared ISA-space IRQs, so we have to support them. We are super
* fast in the common case, and fast for shared ISA-space IRQs.
*/
-static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+static int __add_pin_to_irq_node(struct mp_chip_data *data,
+ int node, int apic, int pin)
{
struct irq_pin_list *entry;
/* don't allow duplicates */
- for_each_irq_pin(entry, cfg->irq_2_pin)
+ for_each_irq_pin(entry, data->irq_2_pin)
if (entry->apic == apic && entry->pin == pin)
return 0;
- entry = alloc_irq_pin_list(node);
+ entry = kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node);
if (!entry) {
pr_err("can not alloc irq_pin_list (%d,%d,%d)\n",
node, apic, pin);
@@ -468,16 +449,16 @@ static int __add_pin_to_irq_node(struct
}
entry->apic = apic;
entry->pin = pin;
+ list_add_tail(&entry->list, &data->irq_2_pin);
- list_add_tail(&entry->list, &cfg->irq_2_pin);
return 0;
}
-static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin)
+static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin)
{
struct irq_pin_list *tmp, *entry;
- list_for_each_entry_safe(entry, tmp, &cfg->irq_2_pin, list)
+ list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list)
if (entry->apic == apic && entry->pin == pin) {
list_del(&entry->list);
kfree(entry);
@@ -485,22 +466,23 @@ static void __remove_pin_from_irq(struct
}
}
-static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+static void add_pin_to_irq_node(struct mp_chip_data *data,
+ int node, int apic, int pin)
{
- if (__add_pin_to_irq_node(cfg, node, apic, pin))
+ if (__add_pin_to_irq_node(data, node, apic, pin))
panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
}
/*
* Reroute an IRQ to a different pin.
*/
-static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
+static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node,
int oldapic, int oldpin,
int newapic, int newpin)
{
struct irq_pin_list *entry;
- for_each_irq_pin(entry, cfg->irq_2_pin) {
+ for_each_irq_pin(entry, data->irq_2_pin) {
if (entry->apic == oldapic && entry->pin == oldpin) {
entry->apic = newapic;
entry->pin = newpin;
@@ -510,32 +492,26 @@ static void __init replace_pin_at_irq_no
}
/* old apic/pin didn't exist, so just add new ones */
- add_pin_to_irq_node(cfg, node, newapic, newpin);
+ add_pin_to_irq_node(data, node, newapic, newpin);
}
-static void __io_apic_modify_irq(struct irq_pin_list *entry,
- int mask_and, int mask_or,
- void (*final)(struct irq_pin_list *entry))
-{
- unsigned int reg, pin;
-
- pin = entry->pin;
- reg = io_apic_read(entry->apic, 0x10 + pin * 2);
- reg &= mask_and;
- reg |= mask_or;
- io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
- if (final)
- final(entry);
-}
-
-static void io_apic_modify_irq(struct irq_cfg *cfg,
+static void io_apic_modify_irq(struct mp_chip_data *data,
int mask_and, int mask_or,
void (*final)(struct irq_pin_list *entry))
{
+ union entry_union eu;
struct irq_pin_list *entry;
- for_each_irq_pin(entry, cfg->irq_2_pin)
- __io_apic_modify_irq(entry, mask_and, mask_or, final);
+ eu.entry = data->entry;
+ eu.w1 &= mask_and;
+ eu.w1 |= mask_or;
+ data->entry = eu.entry;
+
+ for_each_irq_pin(entry, data->irq_2_pin) {
+ io_apic_write(entry->apic, 0x10 + 2 * entry->pin, eu.w1);
+ if (final)
+ final(entry);
+ }
}
static void io_apic_sync(struct irq_pin_list *entry)
@@ -550,39 +526,31 @@ static void io_apic_sync(struct irq_pin_
readl(&io_apic->data);
}
-static void mask_ioapic(struct irq_cfg *cfg)
+static void mask_ioapic_irq(struct irq_data *irq_data)
{
+ struct mp_chip_data *data = irq_data->chip_data;
unsigned long flags;
raw_spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+ io_apic_modify_irq(data, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
-static void mask_ioapic_irq(struct irq_data *data)
+static void __unmask_ioapic(struct mp_chip_data *data)
{
- mask_ioapic(irqd_cfg(data));
+ io_apic_modify_irq(data, ~IO_APIC_REDIR_MASKED, 0, NULL);
}
-static void __unmask_ioapic(struct irq_cfg *cfg)
-{
- io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
-}
-
-static void unmask_ioapic(struct irq_cfg *cfg)
+static void unmask_ioapic_irq(struct irq_data *irq_data)
{
+ struct mp_chip_data *data = irq_data->chip_data;
unsigned long flags;
raw_spin_lock_irqsave(&ioapic_lock, flags);
- __unmask_ioapic(cfg);
+ __unmask_ioapic(data);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
-static void unmask_ioapic_irq(struct irq_data *data)
-{
- unmask_ioapic(irqd_cfg(data));
-}
-
/*
* IO-APIC versions below 0x20 don't support EOI register.
* For the record, here is the information about various versions:
@@ -599,7 +567,7 @@ static void unmask_ioapic_irq(struct irq
* Otherwise, we simulate the EOI message manually by changing the trigger
* mode to edge and then back to level, with RTE being masked during this.
*/
-void native_eoi_ioapic_pin(int apic, int pin, int vector)
+static void __eoi_ioapic_pin(int apic, int pin, int vector)
{
if (mpc_ioapic_ver(apic) >= 0x20) {
io_apic_eoi(apic, vector);
@@ -611,7 +579,7 @@ void native_eoi_ioapic_pin(int apic, int
/*
* Mask the entry and change the trigger mode to edge.
*/
- entry1.mask = 1;
+ entry1.mask = IOAPIC_MASKED;
entry1.trigger = IOAPIC_EDGE;
__ioapic_write_entry(apic, pin, entry1);
@@ -623,15 +591,14 @@ void native_eoi_ioapic_pin(int apic, int
}
}
-void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+void eoi_ioapic_pin(int vector, struct mp_chip_data *data)
{
- struct irq_pin_list *entry;
unsigned long flags;
+ struct irq_pin_list *entry;
raw_spin_lock_irqsave(&ioapic_lock, flags);
- for_each_irq_pin(entry, cfg->irq_2_pin)
- x86_io_apic_ops.eoi_ioapic_pin(entry->apic, entry->pin,
- cfg->vector);
+ for_each_irq_pin(entry, data->irq_2_pin)
+ __eoi_ioapic_pin(entry->apic, entry->pin, vector);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
@@ -648,8 +615,8 @@ static void clear_IO_APIC_pin(unsigned i
* Make sure the entry is masked and re-read the contents to check
* if it is a level triggered pin and if the remote-IRR is set.
*/
- if (!entry.mask) {
- entry.mask = 1;
+ if (entry.mask == IOAPIC_UNMASKED) {
+ entry.mask = IOAPIC_MASKED;
ioapic_write_entry(apic, pin, entry);
entry = ioapic_read_entry(apic, pin);
}
@@ -662,13 +629,12 @@ static void clear_IO_APIC_pin(unsigned i
* doesn't clear the remote-IRR if the trigger mode is not
* set to level.
*/
- if (!entry.trigger) {
+ if (entry.trigger == IOAPIC_EDGE) {
entry.trigger = IOAPIC_LEVEL;
ioapic_write_entry(apic, pin, entry);
}
-
raw_spin_lock_irqsave(&ioapic_lock, flags);
- x86_io_apic_ops.eoi_ioapic_pin(apic, pin, entry.vector);
+ __eoi_ioapic_pin(apic, pin, entry.vector);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
@@ -771,8 +737,8 @@ void mask_ioapic_entries(void)
struct IO_APIC_route_entry entry;
entry = ioapics[apic].saved_registers[pin];
- if (!entry.mask) {
- entry.mask = 1;
+ if (entry.mask == IOAPIC_UNMASKED) {
+ entry.mask = IOAPIC_MASKED;
ioapic_write_entry(apic, pin, entry);
}
}
@@ -877,11 +843,11 @@ static int EISA_ELCR(unsigned int irq)
#endif
-/* ISA interrupts are always polarity zero edge triggered,
+/* ISA interrupts are always active high edge triggered,
* when listed as conforming in the MP table. */
-#define default_ISA_trigger(idx) (0)
-#define default_ISA_polarity(idx) (0)
+#define default_ISA_trigger(idx) (IOAPIC_EDGE)
+#define default_ISA_polarity(idx) (IOAPIC_POL_HIGH)
/* EISA interrupts are always polarity zero and can be edge or level
* trigger depending on the ELCR value. If an interrupt is listed as
@@ -891,53 +857,55 @@ static int EISA_ELCR(unsigned int irq)
#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq))
#define default_EISA_polarity(idx) default_ISA_polarity(idx)
-/* PCI interrupts are always polarity one level triggered,
+/* PCI interrupts are always active low level triggered,
* when listed as conforming in the MP table. */
-#define default_PCI_trigger(idx) (1)
-#define default_PCI_polarity(idx) (1)
+#define default_PCI_trigger(idx) (IOAPIC_LEVEL)
+#define default_PCI_polarity(idx) (IOAPIC_POL_LOW)
static int irq_polarity(int idx)
{
int bus = mp_irqs[idx].srcbus;
- int polarity;
/*
* Determine IRQ line polarity (high active or low active):
*/
- switch (mp_irqs[idx].irqflag & 3)
- {
- case 0: /* conforms, ie. bus-type dependent polarity */
- if (test_bit(bus, mp_bus_not_pci))
- polarity = default_ISA_polarity(idx);
- else
- polarity = default_PCI_polarity(idx);
- break;
- case 1: /* high active */
- {
- polarity = 0;
- break;
- }
- case 2: /* reserved */
- {
- pr_warn("broken BIOS!!\n");
- polarity = 1;
- break;
- }
- case 3: /* low active */
- {
- polarity = 1;
- break;
- }
- default: /* invalid */
- {
- pr_warn("broken BIOS!!\n");
- polarity = 1;
- break;
- }
+ switch (mp_irqs[idx].irqflag & 0x03) {
+ case 0:
+ /* conforms to spec, ie. bus-type dependent polarity */
+ if (test_bit(bus, mp_bus_not_pci))
+ return default_ISA_polarity(idx);
+ else
+ return default_PCI_polarity(idx);
+ case 1:
+ return IOAPIC_POL_HIGH;
+ case 2:
+ pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n");
+ case 3:
+	default: /* Pointless default required due to gcc stupidity */
+ return IOAPIC_POL_LOW;
+ }
+}
+
+#ifdef CONFIG_EISA
+static int eisa_irq_trigger(int idx, int bus, int trigger)
+{
+ switch (mp_bus_id_to_type[bus]) {
+ case MP_BUS_PCI:
+ case MP_BUS_ISA:
+ return trigger;
+ case MP_BUS_EISA:
+ return default_EISA_trigger(idx);
}
- return polarity;
+ pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus);
+ return IOAPIC_LEVEL;
+}
+#else
+static inline int eisa_irq_trigger(int idx, int bus, int trigger)
+{
+ return trigger;
}
+#endif
static int irq_trigger(int idx)
{
@@ -947,153 +915,229 @@ static int irq_trigger(int idx)
/*
* Determine IRQ trigger mode (edge or level sensitive):
*/
- switch ((mp_irqs[idx].irqflag>>2) & 3)
- {
- case 0: /* conforms, ie. bus-type dependent */
- if (test_bit(bus, mp_bus_not_pci))
- trigger = default_ISA_trigger(idx);
- else
- trigger = default_PCI_trigger(idx);
-#ifdef CONFIG_EISA
- switch (mp_bus_id_to_type[bus]) {
- case MP_BUS_ISA: /* ISA pin */
- {
- /* set before the switch */
- break;
- }
- case MP_BUS_EISA: /* EISA pin */
- {
- trigger = default_EISA_trigger(idx);
- break;
- }
- case MP_BUS_PCI: /* PCI pin */
- {
- /* set before the switch */
- break;
- }
- default:
- {
- pr_warn("broken BIOS!!\n");
- trigger = 1;
- break;
- }
- }
-#endif
- break;
- case 1: /* edge */
- {
- trigger = 0;
- break;
- }
- case 2: /* reserved */
- {
- pr_warn("broken BIOS!!\n");
- trigger = 1;
- break;
- }
- case 3: /* level */
- {
- trigger = 1;
- break;
- }
- default: /* invalid */
- {
- pr_warn("broken BIOS!!\n");
- trigger = 0;
- break;
+ switch ((mp_irqs[idx].irqflag >> 2) & 0x03) {
+ case 0:
+ /* conforms to spec, ie. bus-type dependent trigger mode */
+ if (test_bit(bus, mp_bus_not_pci))
+ trigger = default_ISA_trigger(idx);
+ else
+ trigger = default_PCI_trigger(idx);
+ /* Take EISA into account */
+ return eisa_irq_trigger(idx, bus, trigger);
+ case 1:
+ return IOAPIC_EDGE;
+ case 2:
+ pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n");
+ case 3:
+	default: /* Pointless default required due to gcc stupidity */
+ return IOAPIC_LEVEL;
+ }
+}
+
+void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node,
+ int trigger, int polarity)
+{
+ init_irq_alloc_info(info, NULL);
+ info->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+ info->ioapic_node = node;
+ info->ioapic_trigger = trigger;
+ info->ioapic_polarity = polarity;
+ info->ioapic_valid = 1;
+}
+
+#ifndef CONFIG_ACPI
+int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity);
+#endif
+
+static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst,
+ struct irq_alloc_info *src,
+ u32 gsi, int ioapic_idx, int pin)
+{
+ int trigger, polarity;
+
+ copy_irq_alloc_info(dst, src);
+ dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+ dst->ioapic_id = mpc_ioapic_id(ioapic_idx);
+ dst->ioapic_pin = pin;
+ dst->ioapic_valid = 1;
+ if (src && src->ioapic_valid) {
+ dst->ioapic_node = src->ioapic_node;
+ dst->ioapic_trigger = src->ioapic_trigger;
+ dst->ioapic_polarity = src->ioapic_polarity;
+ } else {
+ dst->ioapic_node = NUMA_NO_NODE;
+ if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) {
+ dst->ioapic_trigger = trigger;
+ dst->ioapic_polarity = polarity;
+ } else {
+ /*
+ * PCI interrupts are always active low level
+ * triggered.
+ */
+ dst->ioapic_trigger = IOAPIC_LEVEL;
+ dst->ioapic_polarity = IOAPIC_POL_LOW;
}
}
- return trigger;
}
-static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin)
+static int ioapic_alloc_attr_node(struct irq_alloc_info *info)
+{
+ return (info && info->ioapic_valid) ? info->ioapic_node : NUMA_NO_NODE;
+}
+
+static void mp_register_handler(unsigned int irq, unsigned long trigger)
+{
+#ifndef CONFIG_XEN
+ irq_flow_handler_t hdl;
+ bool fasteoi;
+
+ if (trigger) {
+ irq_set_status_flags(irq, IRQ_LEVEL);
+ fasteoi = true;
+ } else {
+ irq_clear_status_flags(irq, IRQ_LEVEL);
+ fasteoi = false;
+ }
+
+ hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
+ __irq_set_handler(irq, hdl, 0, fasteoi ? "fasteoi" : "edge");
+#endif
+}
+
+static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info)
{
+ struct mp_chip_data *data = irq_get_chip_data(irq);
+
+ /*
+ * setup_IO_APIC_irqs() programs all legacy IRQs with default trigger
+ * and polarity attirbutes. So allow the first user to reprogram the
+ * pin with real trigger and polarity attributes.
+ */
+ if (irq < nr_legacy_irqs() && data->count == 1) {
+ if (info->ioapic_trigger != data->trigger)
+ mp_register_handler(irq, info->ioapic_trigger);
+ data->entry.trigger = data->trigger = info->ioapic_trigger;
+ data->entry.polarity = data->polarity = info->ioapic_polarity;
+ }
+
+ return data->trigger == info->ioapic_trigger &&
+ data->polarity == info->ioapic_polarity;
+}
+
+static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi,
+ struct irq_alloc_info *info)
+{
+ bool legacy = false;
int irq = -1;
- int ioapic = (int)(long)domain->host_data;
int type = ioapics[ioapic].irqdomain_cfg.type;
switch (type) {
case IOAPIC_DOMAIN_LEGACY:
/*
- * Dynamically allocate IRQ number for non-ISA IRQs in the first 16
- * GSIs on some weird platforms.
+ * Dynamically allocate IRQ number for non-ISA IRQs in the first
+ * 16 GSIs on some weird platforms.
*/
- if (gsi < nr_legacy_irqs())
- irq = irq_create_mapping(domain, pin);
- else if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0)
+ if (!ioapic_initialized || gsi >= nr_legacy_irqs())
irq = gsi;
+ legacy = mp_is_legacy_irq(irq);
break;
case IOAPIC_DOMAIN_STRICT:
- if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0)
- irq = gsi;
+ irq = gsi;
break;
case IOAPIC_DOMAIN_DYNAMIC:
- irq = irq_create_mapping(domain, pin);
break;
default:
WARN(1, "ioapic: unknown irqdomain type %d\n", type);
- break;
+ return -1;
}
- return irq > 0 ? irq : -1;
+ return __irq_domain_alloc_irqs(domain, irq, 1,
+ ioapic_alloc_attr_node(info),
+ info, legacy);
+}
+
+/*
+ * Need special handling for ISA IRQs because there may be multiple IOAPIC pins
+ * sharing the same ISA IRQ number and irqdomain only supports 1:1 mapping
+ * between IOAPIC pin and IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are
+ * used for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H).
+ * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are available, and
+ * some BIOSes may use MP Interrupt Source records to override IRQ numbers for
+ * PIRQs instead of reprogramming the interrupt routing logic. Thus there may be
+ * multiple pins sharing the same legacy IRQ number when ACPI is disabled.
+ */
+static int alloc_isa_irq_from_domain(struct irq_domain *domain,
+ int irq, int ioapic, int pin,
+ struct irq_alloc_info *info)
+{
+ struct mp_chip_data *data;
+ struct irq_data *irq_data = irq_get_irq_data(irq);
+ int node = ioapic_alloc_attr_node(info);
+
+ /*
+	 * The legacy ISA IRQ has already been allocated; just add the pin
+	 * to the pin list associated with this IRQ and program the
+	 * IOAPIC entry.
+ */
+ if (irq_data && irq_data->parent_data) {
+ if (!mp_check_pin_attr(irq, info))
+ return -EBUSY;
+ if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic,
+ info->ioapic_pin))
+ return -ENOMEM;
+ } else {
+ irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true);
+ if (irq >= 0) {
+ irq_data = irq_domain_get_irq_data(domain, irq);
+ data = irq_data->chip_data;
+ data->isa_irq = true;
+ }
+ }
+
+ return irq;
}
static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
- unsigned int flags)
+ unsigned int flags, struct irq_alloc_info *info)
{
- int irq;
+ int irq = -ENOENT;
+ bool legacy = false;
+ struct irq_alloc_info tmp;
+ struct mp_chip_data *data;
struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
- struct mp_pin_info *info = mp_pin_info(ioapic, pin);
if (!domain)
- return -1;
-
- mutex_lock(&ioapic_mutex);
+ return -ENOSYS;
- /*
- * Don't use irqdomain to manage ISA IRQs because there may be
- * multiple IOAPIC pins sharing the same ISA IRQ number and
- * irqdomain only supports 1:1 mapping between IOAPIC pin and
- * IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are used
- * for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H).
- * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are
- * available, and some BIOSes may use MP Interrupt Source records
- * to override IRQ numbers for PIRQs instead of reprogramming
- * the interrupt routing logic. Thus there may be multiple pins
- * sharing the same legacy IRQ number when ACPI is disabled.
- */
if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) {
irq = mp_irqs[idx].srcbusirq;
- if (flags & IOAPIC_MAP_ALLOC) {
- if (info->count == 0 &&
- mp_irqdomain_map(domain, irq, pin) != 0)
- irq = -1;
+ legacy = mp_is_legacy_irq(irq);
+ }
- /* special handling for timer IRQ0 */
+ mutex_lock(&ioapic_mutex);
+ if (!(flags & IOAPIC_MAP_ALLOC)) {
+ if (!legacy) {
+ irq = irq_find_mapping(domain, pin);
if (irq == 0)
- info->count++;
+ irq = -ENOENT;
}
} else {
- irq = irq_find_mapping(domain, pin);
- if (irq <= 0 && (flags & IOAPIC_MAP_ALLOC))
- irq = alloc_irq_from_domain(domain, gsi, pin);
- }
-
- if (flags & IOAPIC_MAP_ALLOC) {
- /* special handling for legacy IRQs */
- if (irq < nr_legacy_irqs() && info->count == 1 &&
- mp_irqdomain_map(domain, irq, pin) != 0)
- irq = -1;
-
- if (irq > 0)
- info->count++;
- else if (info->count == 0)
- info->set = 0;
+ ioapic_copy_alloc_attr(&tmp, info, gsi, ioapic, pin);
+ if (legacy)
+ irq = alloc_isa_irq_from_domain(domain, irq,
+ ioapic, pin, &tmp);
+ else if ((irq = irq_find_mapping(domain, pin)) == 0)
+ irq = alloc_irq_from_domain(domain, ioapic, gsi, &tmp);
+ else if (!mp_check_pin_attr(irq, &tmp))
+ irq = -EBUSY;
+ if (irq >= 0) {
+ data = irq_get_chip_data(irq);
+ data->count++;
+ }
}
-
mutex_unlock(&ioapic_mutex);
- return irq > 0 ? irq : -1;
+ return irq;
}
static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags)
@@ -1126,10 +1170,10 @@ static int pin_2_irq(int idx, int ioapic
}
#endif
- return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags);
+ return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, NULL);
}
-int mp_map_gsi_to_irq(u32 gsi, unsigned int flags)
+int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info)
{
int ioapic, pin, idx;
@@ -1142,31 +1186,24 @@ int mp_map_gsi_to_irq(u32 gsi, unsigned
if ((flags & IOAPIC_MAP_CHECK) && idx < 0)
return -1;
- return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags);
+ return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, info);
}
void mp_unmap_irq(int irq)
{
- struct irq_data *data = irq_get_irq_data(irq);
- struct mp_pin_info *info;
- int ioapic, pin;
+ struct irq_data *irq_data = irq_get_irq_data(irq);
+ struct mp_chip_data *data;
- if (!data || !data->domain)
+ if (!irq_data || !irq_data->domain)
return;
- ioapic = (int)(long)data->domain->host_data;
- pin = (int)data->hwirq;
- info = mp_pin_info(ioapic, pin);
+ data = irq_data->chip_data;
+ if (!data || data->isa_irq)
+ return;
mutex_lock(&ioapic_mutex);
- if (--info->count == 0) {
- info->set = 0;
- if (irq < nr_legacy_irqs() &&
- ioapics[ioapic].irqdomain_cfg.type == IOAPIC_DOMAIN_LEGACY)
- mp_irqdomain_unmap(data->domain, irq);
- else
- irq_dispose_mapping(irq);
- }
+ if (--data->count == 0)
+ irq_domain_free_irqs(irq, 1);
mutex_unlock(&ioapic_mutex);
}
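
With the reworked mapping path above, an ACPI-style caller pairs allocation
and teardown roughly as follows (a hedged sketch; gsi and the trigger/
polarity values are illustrative):

	struct irq_alloc_info info;
	int irq;

	ioapic_set_alloc_attr(&info, NUMA_NO_NODE, IOAPIC_LEVEL, IOAPIC_POL_LOW);
	irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC | IOAPIC_MAP_CHECK, &info);
	if (irq < 0)
		return irq;
	/* use the interrupt, then release the reference */
	mp_unmap_irq(irq);	/* drops data->count; last user frees the irq */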
@@ -1234,7 +1271,7 @@ out:
EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
#ifndef CONFIG_XEN
-static struct irq_chip ioapic_chip;
+static struct irq_chip ioapic_chip, ioapic_ir_chip;
#ifdef CONFIG_X86_32
static inline int IO_APIC_irq_trigger(int irq)
@@ -1257,114 +1294,7 @@ static inline int IO_APIC_irq_trigger(in
return 1;
}
#endif
-
-static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
- unsigned long trigger)
-{
- struct irq_chip *chip = &ioapic_chip;
- irq_flow_handler_t hdl;
- bool fasteoi;
-
- if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
- trigger == IOAPIC_LEVEL) {
- irq_set_status_flags(irq, IRQ_LEVEL);
- fasteoi = true;
- } else {
- irq_clear_status_flags(irq, IRQ_LEVEL);
- fasteoi = false;
- }
-
- if (setup_remapped_irq(irq, cfg, chip))
- fasteoi = trigger != 0;
-
- hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
- irq_set_chip_and_handler_name(irq, chip, hdl,
- fasteoi ? "fasteoi" : "edge");
-}
-#endif
-
-#ifndef CONFIG_XEN
-int native_setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
-#else
-static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
#endif
- unsigned int destination, int vector,
- struct io_apic_irq_attr *attr)
-{
- memset(entry, 0, sizeof(*entry));
-
- entry->delivery_mode = apic->irq_delivery_mode;
- entry->dest_mode = apic->irq_dest_mode;
- entry->dest = destination;
- entry->vector = vector;
- entry->mask = 0; /* enable IRQ */
- entry->trigger = attr->trigger;
- entry->polarity = attr->polarity;
-
- /*
- * Mask level triggered irqs.
- * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
- */
- if (attr->trigger)
- entry->mask = 1;
-
- return 0;
-}
-
-static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
- struct io_apic_irq_attr *attr)
-{
- struct IO_APIC_route_entry entry;
- unsigned int dest;
-
- if (!IO_APIC_IRQ(irq))
- return;
-
- if (assign_irq_vector(irq, cfg, apic->target_cpus()))
- return;
-
-#ifndef CONFIG_XEN
- if (apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus(),
- &dest)) {
- pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n",
- mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
- clear_irq_vector(irq, cfg);
-
- return;
- }
-#else
- dest = 0; /* meaningless */
-#endif
-
- apic_printk(APIC_VERBOSE,KERN_DEBUG
- "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
- "IRQ %d Mode:%i Active:%i Dest:%d)\n",
- attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin,
- cfg->vector, irq, attr->trigger, attr->polarity, dest);
-
-#ifndef CONFIG_XEN
- if (x86_io_apic_ops.setup_entry(irq, &entry, dest, cfg->vector, attr)) {
-#else
- if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) {
-#endif
- pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n",
- mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
- clear_irq_vector(irq, cfg);
-
- return;
- }
-
-#ifndef CONFIG_XEN
- ioapic_register_intr(irq, cfg, attr->trigger);
- if (irq < nr_legacy_irqs())
- legacy_pic->mask(irq);
-#else
- evtchn_register_pirq(irq, mp_pin_to_gsi(attr->ioapic,
- attr->ioapic_pin));
-#endif
-
- ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry);
-}
static void __init setup_IO_APIC_irqs(void)
{
@@ -1386,106 +1316,41 @@ static void __init setup_IO_APIC_irqs(vo
}
#ifndef CONFIG_XEN
-/*
- * Set up the timer pin, possibly with the 8259A-master behind.
- */
-static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
- unsigned int pin, int vector)
-{
- struct IO_APIC_route_entry entry;
- unsigned int dest;
-
- memset(&entry, 0, sizeof(entry));
-
- /*
- * We use logical delivery to get the timer IRQ
- * to the first CPU.
- */
- if (unlikely(apic->cpu_mask_to_apicid_and(apic->target_cpus(),
- apic->target_cpus(), &dest)))
- dest = BAD_APICID;
-
- entry.dest_mode = apic->irq_dest_mode;
- entry.mask = 0; /* don't mask IRQ for edge */
- entry.dest = dest;
- entry.delivery_mode = apic->irq_delivery_mode;
- entry.polarity = 0;
- entry.trigger = 0;
- entry.vector = vector;
-
- /*
- * The timer IRQ doesn't have to know that behind the
- * scene we may have a 8259A-master in AEOI mode ...
- */
- irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
- "edge");
-
- /*
- * Add it to the IO-APIC irq-routing table:
- */
- ioapic_write_entry(ioapic_idx, pin, entry);
-}
-
-void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
+void ioapic_zap_locks(void)
{
- int i;
-
- pr_debug(" NR Dst Mask Trig IRR Pol Stat Dmod Deli Vect:\n");
-
- for (i = 0; i <= nr_entries; i++) {
- struct IO_APIC_route_entry entry;
-
- entry = ioapic_read_entry(apic, i);
-
- pr_debug(" %02x %02X ", i, entry.dest);
- pr_cont("%1d %1d %1d %1d %1d "
- "%1d %1d %02X\n",
- entry.mask,
- entry.trigger,
- entry.irr,
- entry.polarity,
- entry.delivery_status,
- entry.dest_mode,
- entry.delivery_mode,
- entry.vector);
- }
+ raw_spin_lock_init(&ioapic_lock);
}
-void intel_ir_io_apic_print_entries(unsigned int apic,
- unsigned int nr_entries)
+static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
{
int i;
+ char buf[256];
+ struct IO_APIC_route_entry entry;
+ struct IR_IO_APIC_route_entry *ir_entry = (void *)&entry;
- pr_debug(" NR Indx Fmt Mask Trig IRR Pol Stat Indx2 Zero Vect:\n");
-
+ printk(KERN_DEBUG "IOAPIC %d:\n", apic);
for (i = 0; i <= nr_entries; i++) {
- struct IR_IO_APIC_route_entry *ir_entry;
- struct IO_APIC_route_entry entry;
-
entry = ioapic_read_entry(apic, i);
-
- ir_entry = (struct IR_IO_APIC_route_entry *)&entry;
-
- pr_debug(" %02x %04X ", i, ir_entry->index);
- pr_cont("%1d %1d %1d %1d %1d "
- "%1d %1d %X %02X\n",
- ir_entry->format,
- ir_entry->mask,
- ir_entry->trigger,
- ir_entry->irr,
- ir_entry->polarity,
- ir_entry->delivery_status,
- ir_entry->index2,
- ir_entry->zero,
- ir_entry->vector);
+ snprintf(buf, sizeof(buf),
+ " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)",
+ i,
+ entry.mask == IOAPIC_MASKED ? "disabled" : "enabled ",
+ entry.trigger == IOAPIC_LEVEL ? "level" : "edge ",
+ entry.polarity == IOAPIC_POL_LOW ? "low " : "high",
+ entry.vector, entry.irr, entry.delivery_status);
+ if (ir_entry->format)
+ printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n",
+			       buf, (ir_entry->index2 << 15) | ir_entry->index,
+ ir_entry->zero);
+ else
+ printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n",
+ buf,
+ entry.dest_mode == IOAPIC_DEST_MODE_LOGICAL ?
+ "logical " : "physical",
+ entry.dest, entry.delivery_mode);
}
}
-void ioapic_zap_locks(void)
-{
- raw_spin_lock_init(&ioapic_lock);
-}
-
static void __init print_IO_APIC(int ioapic_idx)
{
union IO_APIC_reg_00 reg_00;
@@ -1539,16 +1404,13 @@ static void __init print_IO_APIC(int ioa
}
printk(KERN_DEBUG ".... IRQ redirection table:\n");
-
- x86_io_apic_ops.print_entries(ioapic_idx, reg_01.bits.entries);
+ io_apic_print_entries(ioapic_idx, reg_01.bits.entries);
}
void __init print_IO_APICs(void)
{
int ioapic_idx;
- struct irq_cfg *cfg;
unsigned int irq;
- struct irq_chip *chip;
printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
for_each_ioapic(ioapic_idx)
@@ -1568,18 +1430,20 @@ void __init print_IO_APICs(void)
printk(KERN_DEBUG "IRQ to pin mappings:\n");
for_each_active_irq(irq) {
struct irq_pin_list *entry;
+ struct irq_chip *chip;
+ struct mp_chip_data *data;
chip = irq_get_chip(irq);
- if (chip != &ioapic_chip)
+ if (chip != &ioapic_chip && chip != &ioapic_ir_chip)
continue;
-
- cfg = irq_cfg(irq);
- if (!cfg)
+ data = irq_get_chip_data(irq);
+ if (!data)
continue;
- if (list_empty(&cfg->irq_2_pin))
+ if (list_empty(&data->irq_2_pin))
continue;
+
printk(KERN_DEBUG "IRQ%d ", irq);
- for_each_irq_pin(entry, cfg->irq_2_pin)
+ for_each_irq_pin(entry, data->irq_2_pin)
pr_cont("-> %d:%d", entry->apic, entry->pin);
pr_cont("\n");
}
@@ -1652,15 +1516,12 @@ void native_disable_io_apic(void)
struct IO_APIC_route_entry entry;
memset(&entry, 0, sizeof(entry));
- entry.mask = 0; /* Enabled */
- entry.trigger = 0; /* Edge */
- entry.irr = 0;
- entry.polarity = 0; /* High */
- entry.delivery_status = 0;
- entry.dest_mode = 0; /* Physical */
- entry.delivery_mode = dest_ExtINT; /* ExtInt */
- entry.vector = 0;
- entry.dest = read_apic_id();
+ entry.mask = IOAPIC_UNMASKED;
+ entry.trigger = IOAPIC_EDGE;
+ entry.polarity = IOAPIC_POL_HIGH;
+ entry.dest_mode = IOAPIC_DEST_MODE_PHYSICAL;
+ entry.delivery_mode = dest_ExtINT;
+ entry.dest = read_apic_id();
/*
* Add it to the IO-APIC irq-routing table:
@@ -1670,7 +1531,6 @@ void native_disable_io_apic(void)
if (cpu_has_apic || apic_from_smp_config())
disconnect_bsp_APIC(ioapic_i8259.pin != -1);
-
}
/*
@@ -1880,7 +1740,6 @@ static int __init timer_irq_works(void)
* This is not complete - we should be able to fake
* an edge even if it isn't on the 8259A...
*/
-
static unsigned int startup_ioapic_irq(struct irq_data *data)
{
int was_pending = 0, irq = data->irq;
@@ -1892,74 +1751,22 @@ static unsigned int startup_ioapic_irq(s
if (legacy_pic->irq_pending(irq))
was_pending = 1;
}
- __unmask_ioapic(irqd_cfg(data));
+ __unmask_ioapic(data->chip_data);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return was_pending;
}
-/*
- * Level and edge triggered IO-APIC interrupts need different handling,
- * so we use two separate IRQ descriptors. Edge triggered IRQs can be
- * handled with the level-triggered descriptor, but that one has slightly
- * more overhead. Level-triggered interrupts cannot be handled with the
- * edge-triggered handler, without risking IRQ storms and other ugly
- * races.
- */
-
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
-{
- int apic, pin;
- struct irq_pin_list *entry;
- u8 vector = cfg->vector;
-
- for_each_irq_pin(entry, cfg->irq_2_pin) {
- unsigned int reg;
-
- apic = entry->apic;
- pin = entry->pin;
-
- io_apic_write(apic, 0x11 + pin*2, dest);
- reg = io_apic_read(apic, 0x10 + pin*2);
- reg &= ~IO_APIC_REDIR_VECTOR_MASK;
- reg |= vector;
- io_apic_modify(apic, 0x10 + pin*2, reg);
- }
-}
-
-int native_ioapic_set_affinity(struct irq_data *data,
- const struct cpumask *mask,
- bool force)
-{
- unsigned int dest, irq = data->irq;
- unsigned long flags;
- int ret;
-
- if (!config_enabled(CONFIG_SMP))
- return -EPERM;
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- ret = apic_set_affinity(data, mask, &dest);
- if (!ret) {
- /* Only the high 8 bits are valid. */
- dest = SET_APIC_LOGICAL_ID(dest);
- __target_IO_APIC_irq(irq, dest, irqd_cfg(data));
- ret = IRQ_SET_MASK_OK_NOCOPY;
- }
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- return ret;
-}
-
atomic_t irq_mis_count;
#ifdef CONFIG_GENERIC_PENDING_IRQ
-static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
+static bool io_apic_level_ack_pending(struct mp_chip_data *data)
{
struct irq_pin_list *entry;
unsigned long flags;
raw_spin_lock_irqsave(&ioapic_lock, flags);
- for_each_irq_pin(entry, cfg->irq_2_pin) {
+ for_each_irq_pin(entry, data->irq_2_pin) {
unsigned int reg;
int pin;
@@ -1976,18 +1783,17 @@ static bool io_apic_level_ack_pending(st
return false;
}
-static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
+static inline bool ioapic_irqd_mask(struct irq_data *data)
{
/* If we are moving the irq we need to mask it */
if (unlikely(irqd_is_setaffinity_pending(data))) {
- mask_ioapic(cfg);
+ mask_ioapic_irq(data);
return true;
}
return false;
}
-static inline void ioapic_irqd_unmask(struct irq_data *data,
- struct irq_cfg *cfg, bool masked)
+static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
{
if (unlikely(masked)) {
/* Only migrate the irq if the ack has been received.
@@ -2016,31 +1822,30 @@ static inline void ioapic_irqd_unmask(st
* accurate and is causing problems then it is a hardware bug
* and you can go talk to the chipset vendor about it.
*/
- if (!io_apic_level_ack_pending(cfg))
+ if (!io_apic_level_ack_pending(data->chip_data))
irq_move_masked_irq(data);
- unmask_ioapic(cfg);
+ unmask_ioapic_irq(data);
}
}
#else
-static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
+static inline bool ioapic_irqd_mask(struct irq_data *data)
{
return false;
}
-static inline void ioapic_irqd_unmask(struct irq_data *data,
- struct irq_cfg *cfg, bool masked)
+static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
{
}
#endif
-static void ack_ioapic_level(struct irq_data *data)
+static void ioapic_ack_level(struct irq_data *irq_data)
{
- struct irq_cfg *cfg = irqd_cfg(data);
- int i, irq = data->irq;
+ struct irq_cfg *cfg = irqd_cfg(irq_data);
unsigned long v;
bool masked;
+ int i;
irq_complete_move(cfg);
- masked = ioapic_irqd_mask(data, cfg);
+ masked = ioapic_irqd_mask(irq_data);
/*
* It appears there is an erratum which affects at least version 0x11
@@ -2092,11 +1897,49 @@ static void ack_ioapic_level(struct irq_
*/
if (!(v & (1 << (i & 0x1f)))) {
atomic_inc(&irq_mis_count);
+ eoi_ioapic_pin(cfg->vector, irq_data->chip_data);
+ }
+
+ ioapic_irqd_unmask(irq_data, masked);
+}
+
+static void ioapic_ir_ack_level(struct irq_data *irq_data)
+{
+ struct mp_chip_data *data = irq_data->chip_data;
+
+ /*
+ * Intr-remapping uses the pin number as the virtual vector
+ * in the RTE; the actual vector is programmed into the
+ * intr-remapping table entry. Hence for the IO-APIC
+ * EOI we use the pin number.
+ */
+ ack_APIC_irq();
+ eoi_ioapic_pin(data->entry.vector, data);
+}
+
+static int ioapic_set_affinity(struct irq_data *irq_data,
+ const struct cpumask *mask, bool force)
+{
+ struct irq_data *parent = irq_data->parent_data;
+ struct mp_chip_data *data = irq_data->chip_data;
+ struct irq_pin_list *entry;
+ struct irq_cfg *cfg;
+ unsigned long flags;
+ int ret;
- eoi_ioapic_irq(irq, cfg);
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) {
+ cfg = irqd_cfg(irq_data);
+ data->entry.dest = cfg->dest_apicid;
+ data->entry.vector = cfg->vector;
+ for_each_irq_pin(entry, data->irq_2_pin)
+ __ioapic_write_entry(entry->apic, entry->pin,
+ data->entry);
}
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- ioapic_irqd_unmask(data, cfg, masked);
+ return ret;
}
static struct irq_chip ioapic_chip __read_mostly = {
@@ -2104,10 +1947,20 @@ static struct irq_chip ioapic_chip __rea
.irq_startup = startup_ioapic_irq,
.irq_mask = mask_ioapic_irq,
.irq_unmask = unmask_ioapic_irq,
- .irq_ack = apic_ack_edge,
- .irq_eoi = ack_ioapic_level,
- .irq_set_affinity = native_ioapic_set_affinity,
- .irq_retrigger = apic_retrigger_irq,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_eoi = ioapic_ack_level,
+ .irq_set_affinity = ioapic_set_affinity,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+
+static struct irq_chip ioapic_ir_chip __read_mostly = {
+ .name = "IR-IO-APIC",
+ .irq_startup = startup_ioapic_irq,
+ .irq_mask = mask_ioapic_irq,
+ .irq_unmask = unmask_ioapic_irq,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_eoi = ioapic_ir_ack_level,
+ .irq_set_affinity = ioapic_set_affinity,
.flags = IRQCHIP_SKIP_SET_WAKE,
};
#endif /* !CONFIG_XEN */
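A note for readers following the hierarchical-irqdomain conversion: the new
ioapic_set_affinity() above no longer programs vector and destination itself.
It first calls the parent (vector) domain's irq_set_affinity and, only on
success, replays the cached routing entry to every pin. Below is a minimal
userspace model of that parent-first flow; all names are invented, and a
printf stands in for the register write:

#include <stdio.h>

struct parent_result { int vector; int dest_apicid; };
struct rte { int vector; int dest; };

/* Stand-in for the parent domain's irq_set_affinity callback. */
static int parent_set_affinity(int cpu, struct parent_result *out)
{
	out->vector = 0x30 + cpu;	/* pretend vector allocation */
	out->dest_apicid = cpu;
	return 0;
}

static int child_set_affinity(int cpu, struct rte *cached)
{
	struct parent_result res;
	int ret = parent_set_affinity(cpu, &res);

	if (ret < 0)
		return ret;	/* parent failed: leave the RTE untouched */

	/* Refresh the cached entry, then "write" it back to the pin. */
	cached->vector = res.vector;
	cached->dest = res.dest_apicid;
	printf("RTE -> vector 0x%x, dest %d\n", cached->vector, cached->dest);
	return ret;
}

int main(void)
{
	struct rte entry = { 0, 0 };

	return child_set_affinity(2, &entry) < 0;
}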
@@ -2207,12 +2060,12 @@ static inline void __init unlock_ExtINT_
memset(&entry1, 0, sizeof(entry1));
- entry1.dest_mode = 0; /* physical delivery */
- entry1.mask = 0; /* unmask IRQ now */
+ entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL;
+ entry1.mask = IOAPIC_UNMASKED;
entry1.dest = hard_smp_processor_id();
entry1.delivery_mode = dest_ExtINT;
entry1.polarity = entry0.polarity;
- entry1.trigger = 0;
+ entry1.trigger = IOAPIC_EDGE;
entry1.vector = 0;
ioapic_write_entry(apic, pin, entry1);
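The hunk above also replaces raw 0/1 field initializers with named constants
(IOAPIC_DEST_MODE_PHYSICAL, IOAPIC_UNMASKED, IOAPIC_EDGE). A toy illustration
of the same readability gain; the enum names and values below are
placeholders, not the kernel's definitions:

#include <stdio.h>

enum dest_mode { DEST_MODE_PHYSICAL, DEST_MODE_LOGICAL };
enum mask_state { RTE_UNMASKED, RTE_MASKED };
enum trigger { TRIGGER_EDGE, TRIGGER_LEVEL };

struct route_entry {
	enum dest_mode dest_mode;
	enum mask_state mask;
	enum trigger trigger;
};

int main(void)
{
	/* Reads as intent rather than as magic numbers: */
	struct route_entry e = {
		.dest_mode = DEST_MODE_PHYSICAL,
		.mask = RTE_UNMASKED,
		.trigger = TRIGGER_EDGE,
	};

	printf("%d %d %d\n", e.dest_mode, e.mask, e.trigger);
	return 0;
}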
@@ -2246,6 +2099,25 @@ static int __init disable_timer_pin_setu
}
early_param("disable_timer_pin_1", disable_timer_pin_setup);
+static int mp_alloc_timer_irq(int ioapic, int pin)
+{
+ int irq = -1;
+ struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
+
+ if (domain) {
+ struct irq_alloc_info info;
+
+ ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0);
+ info.ioapic_id = mpc_ioapic_id(ioapic);
+ info.ioapic_pin = pin;
+ mutex_lock(&ioapic_mutex);
+ irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info);
+ mutex_unlock(&ioapic_mutex);
+ }
+
+ return irq;
+}
+
/*
* This code may look a bit paranoid, but it's supposed to cooperate with
* a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
@@ -2256,7 +2128,9 @@ early_param("disable_timer_pin_1", disab
*/
static inline void __init check_timer(void)
{
- struct irq_cfg *cfg = irq_cfg(0);
+ struct irq_data *irq_data = irq_get_irq_data(0);
+ struct mp_chip_data *data = irq_data->chip_data;
+ struct irq_cfg *cfg = irqd_cfg(irq_data);
int node = cpu_to_node(0);
int apic1, pin1, apic2, pin2;
unsigned long flags;
@@ -2268,7 +2142,6 @@ static inline void __init check_timer(vo
* get/set the timer IRQ vector:
*/
legacy_pic->mask(0);
- assign_irq_vector(0, cfg, apic->target_cpus());
/*
* As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2309,23 +2182,21 @@ static inline void __init check_timer(vo
}
if (pin1 != -1) {
- /*
- * Ok, does IRQ0 through the IOAPIC work?
- */
+ /* Ok, does IRQ0 through the IOAPIC work? */
if (no_pin1) {
- add_pin_to_irq_node(cfg, node, apic1, pin1);
- setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
+ mp_alloc_timer_irq(apic1, pin1);
} else {
- /* for edge trigger, setup_ioapic_irq already
- * leave it unmasked.
+ /*
+ * For edge trigger, the pin is already unmasked,
* so we only need to unmask it if it is level-triggered.
* Do we really have a level-triggered timer?
*/
int idx;
idx = find_irq_entry(apic1, pin1, mp_INT);
if (idx != -1 && irq_trigger(idx))
- unmask_ioapic(cfg);
+ unmask_ioapic_irq(irq_get_chip_data(0));
}
+ irq_domain_activate_irq(irq_data);
if (timer_irq_works()) {
if (disable_timer_pin_1 > 0)
clear_IO_APIC_pin(0, pin1);
@@ -2345,8 +2216,8 @@ static inline void __init check_timer(vo
/*
* legacy devices should be connected to IO APIC #0
*/
- replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
- setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
+ replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2);
+ irq_domain_activate_irq(irq_data);
legacy_pic->unmask(0);
if (timer_irq_works()) {
apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2426,36 +2297,35 @@ out:
static int mp_irqdomain_create(int ioapic)
{
- size_t size;
+ struct irq_alloc_info info;
+ struct irq_domain *parent;
int hwirqs = mp_ioapic_pin_count(ioapic);
struct ioapic *ip = &ioapics[ioapic];
struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg;
struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
- size = sizeof(struct mp_pin_info) * mp_ioapic_pin_count(ioapic);
- ip->pin_info = kzalloc(size, GFP_KERNEL);
- if (!ip->pin_info)
- return -ENOMEM;
-
if (cfg->type == IOAPIC_DOMAIN_INVALID)
return 0;
+ init_irq_alloc_info(&info, NULL);
+ info.type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+ info.ioapic_id = mpc_ioapic_id(ioapic);
+ parent = irq_remapping_get_ir_irq_domain(&info);
+ if (!parent)
+ parent = x86_vector_domain;
+
ip->irqdomain = irq_domain_add_linear(cfg->dev, hwirqs, cfg->ops,
(void *)(long)ioapic);
- if(!ip->irqdomain) {
- kfree(ip->pin_info);
- ip->pin_info = NULL;
+ if (!ip->irqdomain)
return -ENOMEM;
- }
+
+ ip->irqdomain->parent = parent;
if (cfg->type == IOAPIC_DOMAIN_LEGACY ||
cfg->type == IOAPIC_DOMAIN_STRICT)
ioapic_dynirq_base = max(ioapic_dynirq_base,
gsi_cfg->gsi_end + 1);
- if (gsi_cfg->gsi_base == 0)
- irq_set_default_host(ip->irqdomain);
-
return 0;
}
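mp_irqdomain_create() now wires each IO-APIC domain under a parent: the
interrupt-remapping domain when the IOMMU provides one, otherwise the plain
x86_vector_domain. A compilable toy model of that selection (the struct and
function names are made up for illustration):

#include <stdio.h>

struct domain { const char *name; struct domain *parent; };

static struct domain vector_domain = { "VECTOR", NULL };
static struct domain ir_domain = { "IR", NULL };

/* Stand-in for irq_remapping_get_ir_irq_domain(). */
static struct domain *get_ir_domain(int have_iommu_ir)
{
	return have_iommu_ir ? &ir_domain : NULL;
}

static void create_ioapic_domain(struct domain *ioapic, int have_iommu_ir)
{
	struct domain *parent = get_ir_domain(have_iommu_ir);

	if (!parent)
		parent = &vector_domain;	/* default parent */
	ioapic->parent = parent;
	printf("%s -> parent %s\n", ioapic->name, parent->name);
}

int main(void)
{
	struct domain io = { "IO-APIC", NULL };

	create_ioapic_domain(&io, 0);	/* IO-APIC -> parent VECTOR */
	return 0;
}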
@@ -2465,8 +2335,6 @@ static void ioapic_destroy_irqdomain(int
irq_domain_remove(ioapics[idx].irqdomain);
ioapics[idx].irqdomain = NULL;
}
- kfree(ioapics[idx].pin_info);
- ioapics[idx].pin_info = NULL;
}
void __init setup_IO_APIC(void)
@@ -2498,28 +2366,6 @@ void __init setup_IO_APIC(void)
ioapic_initialized = 1;
}
-/*
- * Called after all the initialization is done. If we didn't find any
- * APIC bugs then we can allow the modify fast path
- */
-
-static int __init io_apic_bug_finalize(void)
-{
- if (sis_apic_bug == -1)
- sis_apic_bug = 0;
-#ifdef CONFIG_X86_XEN
- if (is_initial_xendomain()) {
- struct xen_platform_op op = { .cmd = XENPF_platform_quirk };
- op.u.platform_quirk.quirk_id = sis_apic_bug ?
- QUIRK_IOAPIC_BAD_REGSEL : QUIRK_IOAPIC_GOOD_REGSEL;
- VOID(HYPERVISOR_platform_op(&op));
- }
-#endif
- return 0;
-}
-
-late_initcall(io_apic_bug_finalize);
-
#ifndef CONFIG_XEN
static void resume_ioapic_id(int ioapic_idx)
{
@@ -2560,20 +2406,6 @@ static int __init ioapic_init_ops(void)
device_initcall(ioapic_init_ops);
#endif /* !CONFIG_XEN */
-static int
-io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
-{
- struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node);
- int ret;
-
- if (!cfg)
- return -EINVAL;
- ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin);
- if (!ret)
- setup_ioapic_irq(irq, cfg, attr);
- return ret;
-}
-
static int io_apic_get_redir_entries(int ioapic)
{
union IO_APIC_reg_01 reg_01;
@@ -2806,7 +2638,7 @@ void __init setup_ioapic_dest(void)
else
mask = apic->target_cpus();
- x86_io_apic_ops.set_affinity(idata, mask, false);
+ irq_set_affinity(irq, mask);
}
}
@@ -2851,7 +2683,7 @@ static struct resource * __init ioapic_s
return res;
}
-void __init native_io_apic_init_mappings(void)
+void __init io_apic_init_mappings(void)
{
unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
struct resource *ioapic_res;
@@ -3085,7 +2917,6 @@ int mp_unregister_ioapic(u32 gsi_base)
{
int ioapic, pin;
int found = 0;
- struct mp_pin_info *pin_info;
for_each_ioapic(ioapic)
if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) {
@@ -3098,11 +2929,17 @@ int mp_unregister_ioapic(u32 gsi_base)
}
for_each_pin(ioapic, pin) {
- pin_info = mp_pin_info(ioapic, pin);
- if (pin_info->count) {
- pr_warn("pin%d on IOAPIC%d is still in use.\n",
- pin, ioapic);
- return -EBUSY;
+ u32 gsi = mp_pin_to_gsi(ioapic, pin);
+ int irq = mp_map_gsi_to_irq(gsi, 0, NULL);
+ struct mp_chip_data *data;
+
+ if (irq >= 0) {
+ data = irq_get_chip_data(irq);
+ if (data && data->count) {
+ pr_warn("pin%d on IOAPIC%d is still in use.\n",
+ pin, ioapic);
+ return -EBUSY;
+ }
}
}
@@ -3131,112 +2968,163 @@ int mp_ioapic_registered(u32 gsi_base)
return 0;
}
-static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
- int ioapic, int ioapic_pin,
- int trigger, int polarity)
+static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data,
+ struct irq_alloc_info *info)
{
- irq_attr->ioapic = ioapic;
- irq_attr->ioapic_pin = ioapic_pin;
- irq_attr->trigger = trigger;
- irq_attr->polarity = polarity;
+ if (info && info->ioapic_valid) {
+ data->trigger = info->ioapic_trigger;
+ data->polarity = info->ioapic_polarity;
+ } else if (acpi_get_override_irq(gsi, &data->trigger,
+ &data->polarity) < 0) {
+ /* PCI interrupts are always active-low, level-triggered. */
+ data->trigger = IOAPIC_LEVEL;
+ data->polarity = IOAPIC_POL_LOW;
+ }
+}
+
+static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data,
+ struct IO_APIC_route_entry *entry)
+{
+ memset(entry, 0, sizeof(*entry));
+ entry->delivery_mode = apic->irq_delivery_mode;
+ entry->dest_mode = apic->irq_dest_mode;
+#ifndef CONFIG_XEN /* meaningless on Xen */
+ entry->dest = cfg->dest_apicid;
+#endif
+ entry->vector = cfg->vector;
+ entry->trigger = data->trigger;
+ entry->polarity = data->polarity;
+ /*
+ * Mask level triggered irqs. Edge triggered irqs are masked
+ * by the irq core code in case they fire.
+ */
+ if (data->trigger == IOAPIC_LEVEL)
+ entry->mask = IOAPIC_MASKED;
+ else
+ entry->mask = IOAPIC_UNMASKED;
}
-int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq)
+int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
{
- int ioapic = (int)(long)domain->host_data;
- struct mp_pin_info *info = mp_pin_info(ioapic, hwirq);
- struct io_apic_irq_attr attr;
+ int ret, ioapic, pin;
+ struct irq_cfg *cfg;
+ struct irq_data *irq_data;
+ struct mp_chip_data *data;
+ struct irq_alloc_info *info = arg;
- /* Get default attribute if not set by caller yet */
- if (!info->set) {
- u32 gsi = mp_pin_to_gsi(ioapic, hwirq);
+ if (!info || nr_irqs > 1)
+ return -EINVAL;
+ irq_data = irq_domain_get_irq_data(domain, virq);
+ if (!irq_data)
+ return -EINVAL;
- if (acpi_get_override_irq(gsi, &info->trigger,
- &info->polarity) < 0) {
- /*
- * PCI interrupts are always polarity one level
- * triggered.
- */
- info->trigger = 1;
- info->polarity = 1;
- }
- info->node = NUMA_NO_NODE;
+ ioapic = mp_irqdomain_ioapic_idx(domain);
+ pin = info->ioapic_pin;
+ if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0)
+ return -EEXIST;
- /*
- * setup_IO_APIC_irqs() programs all legacy IRQs with default
- * trigger and polarity attributes. Don't set the flag for that
- * case so the first legacy IRQ user could reprogram the pin
- * with real trigger and polarity attributes.
- */
- if (virq >= nr_legacy_irqs() || info->count)
- info->set = 1;
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ info->ioapic_entry = &data->entry;
+ ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info);
+ if (ret < 0) {
+ kfree(data);
+ return ret;
}
- set_io_apic_irq_attr(&attr, ioapic, hwirq, info->trigger,
- info->polarity);
- return io_apic_setup_irq_pin(virq, info->node, &attr);
-}
+#ifndef CONFIG_XEN
+ INIT_LIST_HEAD(&data->irq_2_pin);
+#endif
+ irq_data->hwirq = info->ioapic_pin;
+#ifndef CONFIG_XEN
+ irq_data->chip = (domain->parent == x86_vector_domain) ?
+ &ioapic_chip : &ioapic_ir_chip;
+#endif
+ irq_data->chip_data = data;
+ mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info);
-void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq)
-{
+ cfg = irqd_cfg(irq_data);
+ add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin);
+ if (info->ioapic_entry)
+ mp_setup_entry(cfg, data, info->ioapic_entry);
#ifndef CONFIG_XEN
- struct irq_data *data = irq_get_irq_data(virq);
- struct irq_cfg *cfg = irq_cfg(virq);
- int ioapic = (int)(long)domain->host_data;
- int pin = (int)data->hwirq;
+ mp_register_handler(virq, data->trigger);
+ if (virq < nr_legacy_irqs())
+ legacy_pic->mask(virq);
+
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n",
+ ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector,
+ virq, data->trigger, data->polarity, cfg->dest_apicid);
+#else
+ evtchn_register_pirq(virq, mp_pin_to_gsi(ioapic, pin));
- ioapic_mask_entry(ioapic, pin);
- __remove_pin_from_irq(cfg, ioapic, pin);
- WARN_ON(!list_empty(&cfg->irq_2_pin));
- arch_teardown_hwirq(virq);
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "IOAPIC[%d]: Set routing entry (%d-%d -> %#x -> IRQ %d Mode:%i Active:%i)\n",
+ ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector,
+ virq, data->trigger, data->polarity);
#endif
+
+ return 0;
}
-int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node)
+void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
{
- int ret = 0;
- int ioapic, pin;
- struct mp_pin_info *info;
+ struct irq_data *irq_data;
+ struct mp_chip_data *data;
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0)
- return -ENODEV;
-
- pin = mp_find_ioapic_pin(ioapic, gsi);
- info = mp_pin_info(ioapic, pin);
- trigger = trigger ? 1 : 0;
- polarity = polarity ? 1 : 0;
-
- mutex_lock(&ioapic_mutex);
- if (!info->set) {
- info->trigger = trigger;
- info->polarity = polarity;
- info->node = node;
- info->set = 1;
- } else if (info->trigger != trigger || info->polarity != polarity) {
- ret = -EBUSY;
+ BUG_ON(nr_irqs != 1);
+ irq_data = irq_domain_get_irq_data(domain, virq);
+ if (irq_data && irq_data->chip_data) {
+ data = irq_data->chip_data;
+#ifndef CONFIG_XEN
+ __remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain),
+ (int)irq_data->hwirq);
+ WARN_ON(!list_empty(&data->irq_2_pin));
+#endif
+ kfree(data);
}
- mutex_unlock(&ioapic_mutex);
-
- return ret;
+ irq_domain_free_irqs_top(domain, virq, nr_irqs);
}
-#ifdef CONFIG_X86_MRST
-/* Enable IOAPIC early just for system timer */
-void __init pre_init_apic_IRQ0(void)
+void mp_irqdomain_activate(struct irq_domain *domain,
+ struct irq_data *irq_data)
{
- struct io_apic_irq_attr attr = { 0, 0, 0, 0 };
+ struct mp_chip_data *data = irq_data->chip_data;
+#ifndef CONFIG_XEN
+ unsigned long flags;
+ struct irq_pin_list *entry;
- printk(KERN_INFO "Early APIC setup for system timer0\n");
-#ifndef CONFIG_SMP
- physid_set_mask_of_physid(boot_cpu_physical_apicid,
- &phys_cpu_present_map);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ for_each_irq_pin(entry, data->irq_2_pin)
+ __ioapic_write_entry(entry->apic, entry->pin, data->entry);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+#else
+ ioapic_write_entry(mp_irqdomain_ioapic_idx(domain), irq_data->hwirq,
+ data->entry);
#endif
- setup_local_APIC();
+}
- io_apic_setup_irq_pin(0, 0, &attr);
- irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
- "edge");
+void mp_irqdomain_deactivate(struct irq_domain *domain,
+ struct irq_data *irq_data)
+{
+ /* This won't be called for an IRQ with multiple IOAPIC pins associated. */
+ ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain),
+ (int)irq_data->hwirq);
}
-#endif
+
+int mp_irqdomain_ioapic_idx(struct irq_domain *domain)
+{
+ return (int)(long)domain->host_data;
+}
+
+const struct irq_domain_ops mp_ioapic_irqdomain_ops = {
+ .alloc = mp_irqdomain_alloc,
+ .free = mp_irqdomain_free,
+ .activate = mp_irqdomain_activate,
+ .deactivate = mp_irqdomain_deactivate,
+};
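The mp_ioapic_irqdomain_ops table just added gives every IO-APIC pin a clean
alloc -> activate -> deactivate -> free lifecycle: chip data is prepared at
allocation time, but the routing entry is only written on activate and masked
again on deactivate. A self-contained sketch of such an ops-table-driven
lifecycle; the struct below is hypothetical, not the kernel's
struct irq_domain_ops:

#include <stdio.h>

struct ops {
	int  (*alloc)(int virq);
	void (*activate)(int virq);
	void (*deactivate)(int virq);
	void (*free)(int virq);
};

static int  demo_alloc(int v)      { printf("alloc %d: chip data set up\n", v); return 0; }
static void demo_activate(int v)   { printf("activate %d: RTE written\n", v); }
static void demo_deactivate(int v) { printf("deactivate %d: RTE masked\n", v); }
static void demo_free(int v)       { printf("free %d: chip data released\n", v); }

static const struct ops demo_ops = {
	.alloc		= demo_alloc,
	.activate	= demo_activate,
	.deactivate	= demo_deactivate,
	.free		= demo_free,
};

int main(void)
{
	if (demo_ops.alloc(5) == 0) {
		demo_ops.activate(5);	/* entry programmed only here */
		demo_ops.deactivate(5);
		demo_ops.free(5);
	}
	return 0;
}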
--- a/arch/x86/kernel/apic/vector-xen.c
+++ b/arch/x86/kernel/apic/vector-xen.c
@@ -3,6 +3,8 @@
*
* Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
* Moved from arch/x86/kernel/apic/io_apic.c.
+ * Jiang Liu <jiang.liu@linux.intel.com>
+ * Enable support of hierarchical irqdomains
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -11,8 +13,8 @@
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compiler.h>
-#include <linux/irqdomain.h>
#include <linux/slab.h>
+#include <asm/irqdomain.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/i8259.h>
@@ -20,7 +22,20 @@
#include <asm/irq_remapping.h>
#ifndef CONFIG_XEN
+struct apic_chip_data {
+ struct irq_cfg cfg;
+ cpumask_var_t domain;
+ cpumask_var_t old_domain;
+ u8 move_in_progress : 1;
+};
+
+struct irq_domain *x86_vector_domain;
static DEFINE_RAW_SPINLOCK(vector_lock);
+static cpumask_var_t vector_cpumask;
+static struct irq_chip lapic_controller;
+#ifdef CONFIG_X86_IO_APIC
+static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY];
+#endif
void lock_vector_lock(void)
{
@@ -34,74 +49,66 @@ void unlock_vector_lock(void)
{
raw_spin_unlock(&vector_lock);
}
-#endif /* CONFIG_XEN */
-struct irq_cfg *irq_cfg(unsigned int irq)
+static struct apic_chip_data *apic_chip_data(struct irq_data *irq_data)
+#else
+struct irq_cfg *irqd_cfg(struct irq_data *irq_data)
+#endif /* CONFIG_XEN */
{
- return irq_get_chip_data(irq);
+ if (!irq_data)
+ return NULL;
+
+ while (irq_data->parent_data)
+ irq_data = irq_data->parent_data;
+
+ return irq_data->chip_data;
}
+#ifndef CONFIG_XEN
struct irq_cfg *irqd_cfg(struct irq_data *irq_data)
{
- return irq_data->chip_data;
+ struct apic_chip_data *data = apic_chip_data(irq_data);
+
+ return data ? &data->cfg : NULL;
+}
+#endif
+
+struct irq_cfg *irq_cfg(unsigned int irq)
+{
+ return irqd_cfg(irq_get_irq_data(irq));
}
#ifndef CONFIG_XEN
-static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
+static struct apic_chip_data *alloc_apic_chip_data(int node)
{
- struct irq_cfg *cfg;
+ struct apic_chip_data *data;
- cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);
- if (!cfg)
+ data = kzalloc_node(sizeof(*data), GFP_KERNEL, node);
+ if (!data)
return NULL;
- if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node))
- goto out_cfg;
- if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node))
+ if (!zalloc_cpumask_var_node(&data->domain, GFP_KERNEL, node))
+ goto out_data;
+ if (!zalloc_cpumask_var_node(&data->old_domain, GFP_KERNEL, node))
goto out_domain;
-#ifdef CONFIG_X86_IO_APIC
- INIT_LIST_HEAD(&cfg->irq_2_pin);
-#endif
- return cfg;
+ return data;
out_domain:
- free_cpumask_var(cfg->domain);
-out_cfg:
- kfree(cfg);
+ free_cpumask_var(data->domain);
+out_data:
+ kfree(data);
return NULL;
}
-struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
+static void free_apic_chip_data(struct apic_chip_data *data)
{
- int res = irq_alloc_desc_at(at, node);
- struct irq_cfg *cfg;
-
- if (res < 0) {
- if (res != -EEXIST)
- return NULL;
- cfg = irq_cfg(at);
- if (cfg)
- return cfg;
+ if (data) {
+ free_cpumask_var(data->domain);
+ free_cpumask_var(data->old_domain);
+ kfree(data);
}
-
- cfg = alloc_irq_cfg(at, node);
- if (cfg)
- irq_set_chip_data(at, cfg);
- else
- irq_free_desc(at);
- return cfg;
}
-static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
-{
- if (!cfg)
- return;
- irq_set_chip_data(at, NULL);
- free_cpumask_var(cfg->domain);
- free_cpumask_var(cfg->old_domain);
- kfree(cfg);
-}
-
-static int
-__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+static int __assign_irq_vector(int irq, struct apic_chip_data *d,
+ const struct cpumask *mask)
{
/*
* NOTE! The local APIC isn't very good at handling
@@ -117,36 +124,33 @@ __assign_irq_vector(int irq, struct irq_
static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
static int current_offset = VECTOR_OFFSET_START % 16;
int cpu, err;
- cpumask_var_t tmp_mask;
- if (cfg->move_in_progress)
+ if (d->move_in_progress)
return -EBUSY;
- if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
- return -ENOMEM;
-
/* Only try and allocate irqs on cpus that are present */
err = -ENOSPC;
- cpumask_clear(cfg->old_domain);
+ cpumask_clear(d->old_domain);
cpu = cpumask_first_and(mask, cpu_online_mask);
while (cpu < nr_cpu_ids) {
int new_cpu, vector, offset;
- apic->vector_allocation_domain(cpu, tmp_mask, mask);
+ apic->vector_allocation_domain(cpu, vector_cpumask, mask);
- if (cpumask_subset(tmp_mask, cfg->domain)) {
+ if (cpumask_subset(vector_cpumask, d->domain)) {
err = 0;
- if (cpumask_equal(tmp_mask, cfg->domain))
+ if (cpumask_equal(vector_cpumask, d->domain))
break;
/*
* New cpumask using the vector is a proper subset of
* the current in use mask. So cleanup the vector
* allocation for the members that are not used anymore.
*/
- cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask);
- cfg->move_in_progress =
- cpumask_intersects(cfg->old_domain, cpu_online_mask);
- cpumask_and(cfg->domain, cfg->domain, tmp_mask);
+ cpumask_andnot(d->old_domain, d->domain,
+ vector_cpumask);
+ d->move_in_progress =
+ cpumask_intersects(d->old_domain, cpu_online_mask);
+ cpumask_and(d->domain, d->domain, vector_cpumask);
break;
}
@@ -160,16 +164,18 @@ next:
}
if (unlikely(current_vector == vector)) {
- cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask);
- cpumask_andnot(tmp_mask, mask, cfg->old_domain);
- cpu = cpumask_first_and(tmp_mask, cpu_online_mask);
+ cpumask_or(d->old_domain, d->old_domain,
+ vector_cpumask);
+ cpumask_andnot(vector_cpumask, mask, d->old_domain);
+ cpu = cpumask_first_and(vector_cpumask,
+ cpu_online_mask);
continue;
}
if (test_bit(vector, used_vectors))
goto next;
- for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) {
+ for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) {
if (per_cpu(vector_irq, new_cpu)[vector] >
VECTOR_UNDEFINED)
goto next;
@@ -177,57 +183,73 @@ next:
/* Found one! */
current_vector = vector;
current_offset = offset;
- if (cfg->vector) {
- cpumask_copy(cfg->old_domain, cfg->domain);
- cfg->move_in_progress =
- cpumask_intersects(cfg->old_domain, cpu_online_mask);
+ if (d->cfg.vector) {
+ cpumask_copy(d->old_domain, d->domain);
+ d->move_in_progress =
+ cpumask_intersects(d->old_domain, cpu_online_mask);
}
- for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
+ for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask)
per_cpu(vector_irq, new_cpu)[vector] = irq;
- cfg->vector = vector;
- cpumask_copy(cfg->domain, tmp_mask);
+ d->cfg.vector = vector;
+ cpumask_copy(d->domain, vector_cpumask);
err = 0;
break;
}
- free_cpumask_var(tmp_mask);
+
+ if (!err) {
+ /* cache destination APIC IDs into cfg->dest_apicid */
+ err = apic->cpu_mask_to_apicid_and(mask, d->domain,
+ &d->cfg.dest_apicid);
+ }
return err;
}
-int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+static int assign_irq_vector(int irq, struct apic_chip_data *data,
+ const struct cpumask *mask)
{
int err;
unsigned long flags;
raw_spin_lock_irqsave(&vector_lock, flags);
- err = __assign_irq_vector(irq, cfg, mask);
+ err = __assign_irq_vector(irq, data, mask);
raw_spin_unlock_irqrestore(&vector_lock, flags);
return err;
}
-#endif /* CONFIG_XEN */
-void clear_irq_vector(int irq, struct irq_cfg *cfg)
+static int assign_irq_vector_policy(int irq, int node,
+ struct apic_chip_data *data,
+ struct irq_alloc_info *info)
+{
+ if (info && info->mask)
+ return assign_irq_vector(irq, data, info->mask);
+ if (node != NUMA_NO_NODE &&
+ assign_irq_vector(irq, data, cpumask_of_node(node)) == 0)
+ return 0;
+ return assign_irq_vector(irq, data, apic->target_cpus());
+}
+
+static void clear_irq_vector(int irq, struct apic_chip_data *data)
{
-#ifndef CONFIG_XEN
int cpu, vector;
unsigned long flags;
raw_spin_lock_irqsave(&vector_lock, flags);
- BUG_ON(!cfg->vector);
+ BUG_ON(!data->cfg.vector);
- vector = cfg->vector;
- for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
+ vector = data->cfg.vector;
+ for_each_cpu_and(cpu, data->domain, cpu_online_mask)
per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
- cfg->vector = 0;
- cpumask_clear(cfg->domain);
+ data->cfg.vector = 0;
+ cpumask_clear(data->domain);
- if (likely(!cfg->move_in_progress)) {
+ if (likely(!data->move_in_progress)) {
raw_spin_unlock_irqrestore(&vector_lock, flags);
return;
}
- for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
+ for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) {
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
vector++) {
if (per_cpu(vector_irq, cpu)[vector] != irq)
@@ -236,12 +258,97 @@ void clear_irq_vector(int irq, struct ir
break;
}
}
- cfg->move_in_progress = 0;
+ data->move_in_progress = 0;
raw_spin_unlock_irqrestore(&vector_lock, flags);
+}
#endif /* CONFIG_XEN */
+
+void init_irq_alloc_info(struct irq_alloc_info *info,
+ const struct cpumask *mask)
+{
+ memset(info, 0, sizeof(*info));
+ info->mask = mask;
+}
+
+void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src)
+{
+ if (src)
+ *dst = *src;
+ else
+ memset(dst, 0, sizeof(*dst));
}
#ifndef CONFIG_XEN
+static void x86_vector_free_irqs(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs)
+{
+ struct irq_data *irq_data;
+ int i;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i);
+ if (irq_data && irq_data->chip_data) {
+ clear_irq_vector(virq + i, irq_data->chip_data);
+ free_apic_chip_data(irq_data->chip_data);
+#ifdef CONFIG_X86_IO_APIC
+ if (virq + i < nr_legacy_irqs())
+ legacy_irq_data[virq + i] = NULL;
+#endif
+ irq_domain_reset_irq_data(irq_data);
+ }
+ }
+}
+
+static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct irq_alloc_info *info = arg;
+ struct apic_chip_data *data;
+ struct irq_data *irq_data;
+ int i, err;
+
+ if (disable_apic)
+ return -ENXIO;
+
+ /* Currently vector allocator can't guarantee contiguous allocations */
+ if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1)
+ return -ENOSYS;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_data = irq_domain_get_irq_data(domain, virq + i);
+ BUG_ON(!irq_data);
+#ifdef CONFIG_X86_IO_APIC
+ if (virq + i < nr_legacy_irqs() && legacy_irq_data[virq + i])
+ data = legacy_irq_data[virq + i];
+ else
+#endif
+ data = alloc_apic_chip_data(irq_data->node);
+ if (!data) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ irq_data->chip = &lapic_controller;
+ irq_data->chip_data = data;
+ irq_data->hwirq = virq + i;
+ err = assign_irq_vector_policy(virq + i, irq_data->node, data,
+ info);
+ if (err)
+ goto error;
+ }
+
+ return 0;
+
+error:
+ x86_vector_free_irqs(domain, virq, i + 1);
+ return err;
+}
+
+static const struct irq_domain_ops x86_vector_domain_ops = {
+ .alloc = x86_vector_alloc_irqs,
+ .free = x86_vector_free_irqs,
+};
+
int __init arch_probe_nr_irqs(void)
{
int nr;
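x86_vector_alloc_irqs() above uses the classic allocate-loop-with-unwind
pattern: on the first failure it tears down everything allocated so far,
including the partially initialized current slot (hence the i + 1 in the
error path). The same shape as a standalone program with invented types:

#include <stdlib.h>

struct chip_data { int vector; };

static struct chip_data *slots[8];

static void free_range(int base, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		free(slots[base + i]);
		slots[base + i] = NULL;
	}
}

static int alloc_range(int base, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		slots[base + i] = calloc(1, sizeof(struct chip_data));
		if (!slots[base + i] || base + i > 6 /* pretend -ENOSPC */) {
			free_range(base, i + 1);	/* unwind, current slot included */
			return -1;
		}
		slots[base + i]->vector = 0x30 + base + i;
	}
	return 0;
}

int main(void)
{
	return alloc_range(0, 4) ? 1 : 0;
}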
@@ -265,8 +372,43 @@ int __init arch_probe_nr_irqs(void)
return nr_legacy_irqs();
}
+#ifdef CONFIG_X86_IO_APIC
+static void init_legacy_irqs(void)
+{
+ int i, node = cpu_to_node(0);
+ struct apic_chip_data *data;
+
+ /*
+ * For legacy IRQs, start with assigning irq0 to irq15 to
+ * ISA_IRQ_VECTOR(i) for all CPUs.
+ */
+ for (i = 0; i < nr_legacy_irqs(); i++) {
+ data = legacy_irq_data[i] = alloc_apic_chip_data(node);
+ BUG_ON(!data);
+
+ data->cfg.vector = ISA_IRQ_VECTOR(i);
+ cpumask_setall(data->domain);
+ irq_set_chip_data(i, data);
+ }
+}
+#else
+static void init_legacy_irqs(void) { }
+#endif
+
int __init arch_early_irq_init(void)
{
+ init_legacy_irqs();
+
+ x86_vector_domain = irq_domain_add_tree(NULL, &x86_vector_domain_ops,
+ NULL);
+ BUG_ON(x86_vector_domain == NULL);
+ irq_set_default_host(x86_vector_domain);
+
+ arch_init_msi_domain(x86_vector_domain);
+ arch_init_htirq_domain(x86_vector_domain);
+
+ BUG_ON(!alloc_cpumask_var(&vector_cpumask, GFP_KERNEL));
+
return arch_early_ioapic_init();
}
@@ -274,23 +416,17 @@ static void __setup_vector_irq(int cpu)
{
/* Initialize vector_irq on a new cpu */
int irq, vector;
- struct irq_cfg *cfg;
+ struct apic_chip_data *data;
- /*
- * vector_lock will make sure that we don't run into irq vector
- * assignments that might be happening on another cpu in parallel,
- * while we setup our initial vector to irq mappings.
- */
- raw_spin_lock(&vector_lock);
/* Mark the inuse vectors */
for_each_active_irq(irq) {
- cfg = irq_cfg(irq);
- if (!cfg)
+ data = apic_chip_data(irq_get_irq_data(irq));
+ if (!data)
continue;
- if (!cpumask_test_cpu(cpu, cfg->domain))
+ if (!cpumask_test_cpu(cpu, data->domain))
continue;
- vector = cfg->vector;
+ vector = data->cfg.vector;
per_cpu(vector_irq, cpu)[vector] = irq;
}
/* Mark the free vectors */
@@ -299,20 +435,20 @@ static void __setup_vector_irq(int cpu)
if (irq <= VECTOR_UNDEFINED)
continue;
- cfg = irq_cfg(irq);
- if (!cpumask_test_cpu(cpu, cfg->domain))
+ data = apic_chip_data(irq_get_irq_data(irq));
+ if (!cpumask_test_cpu(cpu, data->domain))
per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
}
- raw_spin_unlock(&vector_lock);
}
/*
- * Setup the vector to irq mappings.
+ * Set up the vector to irq mappings. Must be called with vector_lock held.
*/
void setup_vector_irq(int cpu)
{
int irq;
+ lockdep_assert_held(&vector_lock);
/*
* On most of the platforms, legacy PIC delivers the interrupts on the
* boot cpu. But there are certain platforms where PIC interrupts are
@@ -321,20 +457,20 @@ void setup_vector_irq(int cpu)
* legacy vector to irq mapping:
*/
for (irq = 0; irq < nr_legacy_irqs(); irq++)
- per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq;
+ per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq;
__setup_vector_irq(cpu);
}
-int apic_retrigger_irq(struct irq_data *data)
+static int apic_retrigger_irq(struct irq_data *irq_data)
{
- struct irq_cfg *cfg = irqd_cfg(data);
+ struct apic_chip_data *data = apic_chip_data(irq_data);
unsigned long flags;
int cpu;
raw_spin_lock_irqsave(&vector_lock, flags);
- cpu = cpumask_first_and(cfg->domain, cpu_online_mask);
- apic->send_IPI_mask(cpumask_of(cpu), cfg->vector);
+ cpu = cpumask_first_and(data->domain, cpu_online_mask);
+ apic->send_IPI_mask(cpumask_of(cpu), data->cfg.vector);
raw_spin_unlock_irqrestore(&vector_lock, flags);
return 1;
@@ -347,73 +483,76 @@ void apic_ack_edge(struct irq_data *data
ack_APIC_irq();
}
-/*
- * Either sets data->affinity to a valid value, and returns
- * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
- * leaves data->affinity untouched.
- */
-int apic_set_affinity(struct irq_data *data, const struct cpumask *mask,
- unsigned int *dest_id)
+static int apic_set_affinity(struct irq_data *irq_data,
+ const struct cpumask *dest, bool force)
{
- struct irq_cfg *cfg = irqd_cfg(data);
- unsigned int irq = data->irq;
- int err;
+ struct apic_chip_data *data = irq_data->chip_data;
+ int err, irq = irq_data->irq;
if (!config_enabled(CONFIG_SMP))
return -EPERM;
- if (!cpumask_intersects(mask, cpu_online_mask))
+ if (!cpumask_intersects(dest, cpu_online_mask))
return -EINVAL;
- err = assign_irq_vector(irq, cfg, mask);
- if (err)
- return err;
-
- err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id);
+ err = assign_irq_vector(irq, data, dest);
if (err) {
- if (assign_irq_vector(irq, cfg, data->affinity))
+ struct irq_data *top = irq_get_irq_data(irq);
+
+ if (assign_irq_vector(irq, data, top->affinity))
pr_err("Failed to recover vector for irq %d\n", irq);
return err;
}
- cpumask_copy(data->affinity, mask);
-
- return 0;
+ return IRQ_SET_MASK_OK;
}
+static struct irq_chip lapic_controller = {
+ .irq_ack = apic_ack_edge,
+ .irq_set_affinity = apic_set_affinity,
+ .irq_retrigger = apic_retrigger_irq,
+};
+
#ifdef CONFIG_SMP
-void send_cleanup_vector(struct irq_cfg *cfg)
+static void __send_cleanup_vector(struct apic_chip_data *data)
{
cpumask_var_t cleanup_mask;
if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
unsigned int i;
- for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+ for_each_cpu_and(i, data->old_domain, cpu_online_mask)
apic->send_IPI_mask(cpumask_of(i),
IRQ_MOVE_CLEANUP_VECTOR);
} else {
- cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
+ cpumask_and(cleanup_mask, data->old_domain, cpu_online_mask);
apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
free_cpumask_var(cleanup_mask);
}
- cfg->move_in_progress = 0;
+ data->move_in_progress = 0;
+}
+
+void send_cleanup_vector(struct irq_cfg *cfg)
+{
+ struct apic_chip_data *data;
+
+ data = container_of(cfg, struct apic_chip_data, cfg);
+ if (data->move_in_progress)
+ __send_cleanup_vector(data);
}
asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
{
unsigned vector, me;
- ack_APIC_irq();
- irq_enter();
- exit_idle();
+ entering_ack_irq();
me = smp_processor_id();
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
int irq;
unsigned int irr;
struct irq_desc *desc;
- struct irq_cfg *cfg;
+ struct apic_chip_data *data;
irq = __this_cpu_read(vector_irq[vector]);
@@ -424,8 +563,8 @@ asmlinkage __visible void smp_irq_move_c
if (!desc)
continue;
- cfg = irq_cfg(irq);
- if (!cfg)
+ data = apic_chip_data(&desc->irq_data);
+ if (!data)
continue;
raw_spin_lock(&desc->lock);
@@ -434,10 +573,11 @@ asmlinkage __visible void smp_irq_move_c
* Check if the irq migration is in progress. If so, we
* haven't received the cleanup request yet for this irq.
*/
- if (cfg->move_in_progress)
+ if (data->move_in_progress)
goto unlock;
- if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
+ if (vector == data->cfg.vector &&
+ cpumask_test_cpu(me, data->domain))
goto unlock;
irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
@@ -457,20 +597,21 @@ unlock:
raw_spin_unlock(&desc->lock);
}
- irq_exit();
+ exiting_irq();
}
static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
{
unsigned me;
+ struct apic_chip_data *data;
- if (likely(!cfg->move_in_progress))
+ data = container_of(cfg, struct apic_chip_data, cfg);
+ if (likely(!data->move_in_progress))
return;
me = smp_processor_id();
-
- if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
- send_cleanup_vector(cfg);
+ if (vector == data->cfg.vector && cpumask_test_cpu(me, data->domain))
+ __send_cleanup_vector(data);
}
void irq_complete_move(struct irq_cfg *cfg)
@@ -482,46 +623,11 @@ void irq_force_complete_move(int irq)
{
struct irq_cfg *cfg = irq_cfg(irq);
- if (!cfg)
- return;
-
- __irq_complete_move(cfg, cfg->vector);
+ if (cfg)
+ __irq_complete_move(cfg, cfg->vector);
}
#endif
-/*
- * Dynamic irq allocate and deallocation. Should be replaced by irq domains!
- */
-int arch_setup_hwirq(unsigned int irq, int node)
-{
- struct irq_cfg *cfg;
- unsigned long flags;
- int ret;
-
- cfg = alloc_irq_cfg(irq, node);
- if (!cfg)
- return -ENOMEM;
-
- raw_spin_lock_irqsave(&vector_lock, flags);
- ret = __assign_irq_vector(irq, cfg, apic->target_cpus());
- raw_spin_unlock_irqrestore(&vector_lock, flags);
-
- if (!ret)
- irq_set_chip_data(irq, cfg);
- else
- free_irq_cfg(irq, cfg);
- return ret;
-}
-
-void arch_teardown_hwirq(unsigned int irq)
-{
- struct irq_cfg *cfg = irq_cfg(irq);
-
- free_remapped_irq(irq);
- clear_irq_vector(irq, cfg);
- free_irq_cfg(irq, cfg);
-}
-
static void __init print_APIC_field(int base)
{
int i;
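A recurring pattern in this file: retargeting an interrupt records the CPUs
that drop out of the vector domain in old_domain and sets move_in_progress;
the cleanup-IPI path later releases the vector on exactly those CPUs. A toy
bitmask model of that bookkeeping (plain unsigned ints stand in for cpumasks;
nothing here is kernel API):

#include <stdio.h>

struct chip_data {
	unsigned int domain;		/* CPUs currently using the vector */
	unsigned int old_domain;	/* CPUs awaiting cleanup */
	int move_in_progress;
};

static void retarget(struct chip_data *d, unsigned int new_mask)
{
	d->old_domain = d->domain & ~new_mask;	/* leavers */
	d->move_in_progress = d->old_domain != 0;
	d->domain &= new_mask;			/* stayers */
}

static void cleanup(struct chip_data *d)
{
	if (!d->move_in_progress)
		return;
	printf("release vector on CPUs 0x%x\n", d->old_domain);
	d->old_domain = 0;
	d->move_in_progress = 0;
}

int main(void)
{
	struct chip_data d = { .domain = 0x3 };	/* CPUs 0 and 1 */

	retarget(&d, 0x2);			/* keep only CPU 1 */
	cleanup(&d);
	return 0;
}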
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -61,6 +61,8 @@ void foo(void)
#ifdef CONFIG_XEN
BLANK();
+ OFFSET(TI_cpu, thread_info, cpu);
+ BLANK();
OFFSET(XEN_START_mfn_list, start_info, mfn_list);
#endif
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -296,7 +296,7 @@ static int nearby_node(int apicid)
* Assumption: Number of cores in each internal node is the same.
* (2) AMD processors supporting compute units
*/
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
static void amd_get_topology(struct cpuinfo_x86 *c)
{
u32 cores_per_cu = 1;
@@ -349,7 +349,7 @@ static void amd_get_topology(struct cpui
*/
static void amd_detect_cmp(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
unsigned bits;
int cpu = smp_processor_id();
@@ -434,7 +434,7 @@ static void srat_detect_node(struct cpui
static void early_init_amd_mc(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
unsigned bits, ecx;
/* Multi core CPU? */
--- a/arch/x86/kernel/cpu/common-xen.c
+++ b/arch/x86/kernel/cpu/common-xen.c
@@ -5,6 +5,7 @@
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/string.h>
+#include <linux/ctype.h>
#include <linux/delay.h>
#include <linux/sched.h>
#include <linux/init.h>
@@ -31,8 +32,7 @@
#include <asm/setup.h>
#include <asm/apic.h>
#include <asm/desc.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
#include <asm/mtrr.h>
#include <linux/numa.h>
#include <asm/asm.h>
@@ -162,32 +162,21 @@ EXPORT_PER_CPU_SYMBOL(xen_x86_cr0);
EXPORT_PER_CPU_SYMBOL(xen_x86_cr0_upd);
#endif
-static int __init x86_xsave_setup(char *s)
+static int __init x86_mpx_setup(char *s)
{
- if (s && *s)
+ /* require an exact match without trailing characters */
+ if (strlen(s))
return 0;
- setup_clear_cpu_cap(X86_FEATURE_XSAVE);
- setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
- setup_clear_cpu_cap(X86_FEATURE_XSAVES);
- setup_clear_cpu_cap(X86_FEATURE_AVX);
- setup_clear_cpu_cap(X86_FEATURE_AVX2);
- return 1;
-}
-__setup("noxsave", x86_xsave_setup);
-static int __init x86_xsaveopt_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
- return 1;
-}
-__setup("noxsaveopt", x86_xsaveopt_setup);
+ /* do not emit a message if the feature is not present */
+ if (!boot_cpu_has(X86_FEATURE_MPX))
+ return 1;
-static int __init x86_xsaves_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+ setup_clear_cpu_cap(X86_FEATURE_MPX);
+ pr_info("nompx: Intel Memory Protection Extensions (MPX) disabled\n");
return 1;
}
-__setup("noxsaves", x86_xsaves_setup);
+__setup("nompx", x86_mpx_setup);
#ifdef CONFIG_X86_32
static int cachesize_override = -1;
@@ -199,14 +188,6 @@ static int __init cachesize_setup(char *
}
__setup("cachesize=", cachesize_setup);
-static int __init x86_fxsr_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_FXSR);
- setup_clear_cpu_cap(X86_FEATURE_XMM);
- return 1;
-}
-__setup("nofxsr", x86_fxsr_setup);
-
static int __init x86_sep_setup(char *s)
{
setup_clear_cpu_cap(X86_FEATURE_SEP);
@@ -461,7 +442,7 @@ static const struct cpu_dev *cpu_devs[X8
static void get_model_name(struct cpuinfo_x86 *c)
{
unsigned int *v;
- char *p, *q;
+ char *p, *q, *s;
if (c->extended_cpuid_level < 0x80000004)
return;
@@ -472,19 +453,21 @@ static void get_model_name(struct cpuinf
cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
c->x86_model_id[48] = 0;
- /*
- * Intel chips right-justify this string for some dumb reason;
- * undo that brain damage:
- */
- p = q = &c->x86_model_id[0];
+ /* Trim whitespace */
+ p = q = s = &c->x86_model_id[0];
+
while (*p == ' ')
p++;
- if (p != q) {
- while (*p)
- *q++ = *p++;
- while (q <= &c->x86_model_id[48])
- *q++ = '\0'; /* Zero-pad the rest */
+
+ while (*p) {
+ /* Note the last non-whitespace index */
+ if (!isspace(*p))
+ s = q;
+
+ *q++ = *p++;
}
+
+ *(s + 1) = '\0';
}
void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
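The rewritten get_model_name() trims in place with a single pass: skip the
leading spaces, copy while remembering where the last non-whitespace
character landed, then terminate one past it. The same logic as a runnable
userspace program:

#include <ctype.h>
#include <stdio.h>

static void trim(char *id)
{
	char *p = id, *q = id, *s = id;

	while (*p == ' ')
		p++;
	while (*p) {
		if (!isspace((unsigned char)*p))
			s = q;		/* last non-space written so far */
		*q++ = *p++;
	}
	*(s + 1) = '\0';
}

int main(void)
{
	char model[] = "      Intel(R) Xeon(R) CPU   ";

	trim(model);
	printf("[%s]\n", model);	/* prints [Intel(R) Xeon(R) CPU] */
	return 0;
}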
@@ -550,7 +533,7 @@ static void cpu_detect_tlb(struct cpuinf
void detect_ht(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_HT
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
u32 eax, ebx, ecx, edx;
int index_msb, core_bits;
static bool printed;
@@ -804,11 +787,6 @@ static void __init early_identify_cpu(st
cpu_detect(c);
get_cpu_vendor(c);
get_cpu_cap(c);
- fpu_detect(c);
-#ifdef CONFIG_XEN
- if (!cpu_has_xsave)
- x86_xsave_setup(NULL);
-#endif
if (this_cpu->c_early_init)
this_cpu->c_early_init(c);
@@ -820,6 +798,7 @@ static void __init early_identify_cpu(st
this_cpu->c_bsp_init(c);
setup_force_cpu_cap(X86_FEATURE_ALWAYS);
+ fpu__init_system(c);
}
void __init early_cpu_init(void)
@@ -894,7 +873,7 @@ static void generic_identify(struct cpui
if (c->cpuid_level >= 0x00000001) {
c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
#ifdef CONFIG_X86_32
-# ifdef CONFIG_X86_HT
+# ifdef CONFIG_SMP
c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
# else
c->apicid = c->initial_apicid;
@@ -1060,19 +1039,19 @@ static void identify_cpu(struct cpuinfo_
#ifdef CONFIG_X86_32
void enable_sep_cpu(void)
{
- extern asmlinkage void ia32pv_sysenter_target(void);
+ extern asmlinkage void entry_SYSENTER_PV32(void);
static struct callback_register sysenter = {
.type = CALLBACKTYPE_sysenter,
- .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
+ .address = { __KERNEL_CS, (unsigned long)entry_SYSENTER_PV32 },
};
# ifdef TIF_CSTAR
if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
- extern asmlinkage void ia32pv_cstar_target(void);
+ extern asmlinkage void entry_SYSCALL_PV32(void);
static const struct callback_register cstar = {
.type = CALLBACKTYPE_syscall32,
.address = { __KERNEL_CS,
- (unsigned long)ia32pv_cstar_target },
+ (unsigned long)entry_SYSCALL_PV32 },
};
if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0)
@@ -1085,7 +1064,7 @@ void enable_sep_cpu(void)
return;
if (xen_feature(XENFEAT_supervisor_mode_kernel))
- sysenter.address.eip = (unsigned long)ia32_sysenter_target;
+ sysenter.address.eip = (unsigned long)entry_SYSENTER_32;
switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) {
case 0:
@@ -1196,7 +1175,7 @@ void print_cpu_info(struct cpuinfo_x86 *
printk(KERN_CONT "%s ", vendor);
if (c->x86_model_id[0])
- printk(KERN_CONT "%s", strim(c->x86_model_id));
+ printk(KERN_CONT "%s", c->x86_model_id);
else
printk(KERN_CONT "%d86", c->x86);
@@ -1229,10 +1208,6 @@ static __init int setup_disablecpuid(cha
}
__setup("clearcpuid=", setup_disablecpuid);
-DEFINE_PER_CPU(unsigned long, kernel_stack) =
- (unsigned long)&init_thread_union + THREAD_SIZE;
-EXPORT_PER_CPU_SYMBOL(kernel_stack);
-
#ifdef CONFIG_X86_64
#ifndef CONFIG_X86_NO_IDT
struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
@@ -1267,8 +1242,6 @@ DEFINE_PER_CPU(unsigned int, irq_count)
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);
-DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
-
#ifndef CONFIG_X86_NO_TSS
/*
* Special IST stacks which the CPU switches to when it calls
@@ -1294,10 +1267,10 @@ void syscall_init(void)
* set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
*/
wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
- wrmsrl(MSR_LSTAR, system_call);
+ wrmsrl(MSR_LSTAR, entry_SYSCALL_64);
#ifdef CONFIG_IA32_EMULATION
- wrmsrl(MSR_CSTAR, ia32_cstar_target);
+ wrmsrl(MSR_CSTAR, entry_SYSCALL_compat);
/*
* This only works on Intel CPUs.
* On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
@@ -1306,7 +1279,7 @@ void syscall_init(void)
*/
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
- wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
wrmsrl(MSR_CSTAR, ignore_sysret);
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
@@ -1322,11 +1295,11 @@ void syscall_init(void)
#ifdef CONFIG_IA32_EMULATION
static const struct callback_register cstar = {
.type = CALLBACKTYPE_syscall32,
- .address = (unsigned long)ia32_cstar_target
+ .address = (unsigned long)entry_SYSCALL_compat
};
static const struct callback_register sysenter = {
.type = CALLBACKTYPE_sysenter,
- .address = (unsigned long)ia32_sysenter_target
+ .address = (unsigned long)entry_SYSENTER_compat
};
if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0)
@@ -1393,7 +1366,6 @@ DEFINE_PER_CPU(struct task_struct *, cur
EXPORT_PER_CPU_SYMBOL(current_task);
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);
-DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
/*
* On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
@@ -1571,12 +1543,12 @@ void cpu_init(void)
set_tss_desc(cpu, t);
load_TR_desc();
#endif
- load_LDT(&init_mm.context);
+ load_mm_ldt(&init_mm);
clear_all_debug_regs();
dbg_restore_debug_regs();
- fpu_init();
+ fpu__init_cpu();
#ifdef CONFIG_X86_LOCAL_APIC
if (is_uv_system())
@@ -1622,7 +1594,7 @@ void cpu_init(void)
load_sp0(t, thread);
- load_LDT(&init_mm.context);
+ load_mm_ldt(&init_mm);
#ifndef CONFIG_X86_NO_TSS
t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
@@ -1636,7 +1608,7 @@ void cpu_init(void)
clear_all_debug_regs();
dbg_restore_debug_regs();
- fpu_init();
+ fpu__init_cpu();
}
#endif
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -654,7 +654,7 @@ unsigned int init_intel_cacheinfo(struct
unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0;
unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
unsigned int cpu = c->cpu_index;
#endif
@@ -689,7 +689,7 @@ unsigned int init_intel_cacheinfo(struct
break;
case 2:
new_l2 = this_leaf.size/1024;
-#ifdef CONFIG_X86_HT
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
index_msb = get_count_order(num_threads_sharing);
l2_id = c->apicid & ~((1 << index_msb) - 1);
@@ -697,7 +697,7 @@ unsigned int init_intel_cacheinfo(struct
break;
case 3:
new_l3 = this_leaf.size/1024;
-#ifdef CONFIG_X86_HT
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
index_msb = get_count_order(num_threads_sharing);
l3_id = c->apicid & ~((1 << index_msb) - 1);
@@ -778,19 +778,19 @@ unsigned int init_intel_cacheinfo(struct
if (new_l2) {
l2 = new_l2;
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
per_cpu(cpu_llc_id, cpu) = l2_id;
#endif
}
if (new_l3) {
l3 = new_l3;
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
per_cpu(cpu_llc_id, cpu) = l3_id;
#endif
}
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
/*
* If cpu_llc_id is not yet set, this means cpuid_level < 4 which in
* turns means that the only possibility is SMT (as indicated in
--- a/arch/x86/kernel/cpu/microcode/core-xen.c
+++ b/arch/x86/kernel/cpu/microcode/core-xen.c
@@ -1,25 +1,16 @@
/*
- * CPU Microcode Update Driver for Linux on Xen
+ * CPU Microcode Update Driver for Linux on Xen
*
- * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
- * 2006 Shaohua Li <shaohua.li@intel.com>
- *
- * This driver allows to upgrade microcode on Intel processors
- * belonging to IA-32 family - PentiumPro, Pentium II,
- * Pentium III, Xeon, Pentium 4, etc.
- *
- * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
- * Software Developer's Manual
- * Order Number 253668 or free download from:
- *
- * http://developer.intel.com/Assets/PDF/manual/253668.pdf
- *
- * For more information, go to http://www.urbanmyth.org/microcode
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
+ * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ * 2006 Shaohua Li <shaohua.li@intel.com>
+ * 2013-2015 Borislav Petkov <bp@alien8.de>
+ *
+ * This driver allows updating microcode on x86 processors.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
--- a/arch/x86/kernel/cpu/mtrr/main-xen.c
+++ b/arch/x86/kernel/cpu/mtrr/main-xen.c
@@ -36,6 +36,13 @@ const struct mtrr_ops generic_mtrr_ops =
const struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
unsigned int num_var_ranges;
+static bool __mtrr_enabled;
+
+static bool mtrr_enabled(void)
+{
+ return __mtrr_enabled;
+}
+
unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
static u64 tom2;
@@ -67,6 +74,9 @@ int mtrr_add_page(unsigned long base, un
int error;
struct xen_platform_op op;
+ if (!mtrr_enabled())
+ return -ENXIO;
+
mutex_lock(&mtrr_mutex);
op.cmd = XENPF_add_memtype;
@@ -102,6 +112,8 @@ static int mtrr_check(unsigned long base
int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
bool increment)
{
+ if (!mtrr_enabled())
+ return -ENODEV;
if (mtrr_check(base, size))
return -EINVAL;
return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
@@ -117,6 +129,9 @@ int mtrr_del_page(int reg, unsigned long
int error = -EINVAL;
struct xen_platform_op op;
+ if (!mtrr_enabled())
+ return -ENODEV;
+
mutex_lock(&mtrr_mutex);
if (reg < 0) {
@@ -156,6 +171,8 @@ int mtrr_del_page(int reg, unsigned long
int mtrr_del(int reg, unsigned long base, unsigned long size)
{
+ if (!mtrr_enabled())
+ return -ENODEV;
if (mtrr_check(base, size))
return -EINVAL;
return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
@@ -171,6 +188,9 @@ EXPORT_SYMBOL(mtrr_del);
* attempts to add a WC MTRR covering size bytes starting at base and
* logs an error if this fails.
*
+ * The caller should provide a power-of-two size on an equivalent
+ * power-of-two boundary.
+ *
* Drivers must store the return value to pass to mtrr_del_wc_if_needed,
* but drivers should not try to interpret that return value.
*/
@@ -178,7 +198,7 @@ int arch_phys_wc_add(unsigned long base,
{
int ret;
- if (pat_enabled)
+ if (pat_enabled() || !mtrr_enabled())
return 0; /* Success! (We don't need to do anything.) */
ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true);
@@ -210,7 +230,7 @@ void arch_phys_wc_del(int handle)
EXPORT_SYMBOL(arch_phys_wc_del);
/*
- * phys_wc_to_mtrr_index - translates arch_phys_wc_add's return value
+ * arch_phys_wc_index - translates arch_phys_wc_add's return value
* @handle: Return value from arch_phys_wc_add
*
* This will turn the return value from arch_phys_wc_add into an mtrr
@@ -220,33 +240,40 @@ EXPORT_SYMBOL(arch_phys_wc_del);
* in printk line. Alas there is an illegitimate use in some ancient
* drm ioctls.
*/
-int phys_wc_to_mtrr_index(int handle)
+int arch_phys_wc_index(int handle)
{
if (handle < MTRR_TO_PHYS_WC_OFFSET)
return -1;
else
return handle - MTRR_TO_PHYS_WC_OFFSET;
}
-EXPORT_SYMBOL_GPL(phys_wc_to_mtrr_index);
+EXPORT_SYMBOL_GPL(arch_phys_wc_index);
-/*
- * Returns the effective MTRR type for the region
- * Error returns:
- * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR
- * - 0xFF - when MTRR is not enabled
+/**
+ * mtrr_type_lookup - look up memory type in MTRR
+ *
+ * Return Values:
+ * MTRR_TYPE_(type) - The effective MTRR type for the region
+ * MTRR_TYPE_INVALID - MTRR is disabled
+ *
+ * Output Argument:
+ * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the
+ * region is fully covered by a single MTRR entry or the default
+ * type.
*/
-u8 mtrr_type_lookup(u64 start, u64 end)
+u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform)
{
int i, error;
u64 start_mfn, end_mfn, base_mfn, top_mfn;
u8 prev_match, curr_match;
struct xen_platform_op op;
+ *uniform = 1;
if (!is_initial_xendomain())
return MTRR_TYPE_WRBACK;
if (!num_var_ranges)
- return 0xFF;
+ return MTRR_TYPE_INVALID;
start_mfn = start >> PAGE_SHIFT;
/* Make end inclusive end, instead of exclusive */
@@ -261,6 +288,7 @@ u8 mtrr_type_lookup(u64 start, u64 end)
if (!error)
return op.u.read_memtype.type;
#endif
+ *uniform = 0;
return MTRR_TYPE_UNCACHABLE;
}
@@ -269,7 +297,7 @@ u8 mtrr_type_lookup(u64 start, u64 end)
* Look for multiple ranges matching this address and pick type
* as per MTRR precedence
*/
- prev_match = 0xFF;
+ prev_match = MTRR_TYPE_INVALID;
for (i = 0; i < num_var_ranges; ++i) {
op.cmd = XENPF_read_memtype;
op.u.read_memtype.reg = i;
@@ -285,12 +313,11 @@ u8 mtrr_type_lookup(u64 start, u64 end)
continue;
}
- if (base_mfn > start_mfn || end_mfn > top_mfn) {
- return 0xFE;
- }
+ if (base_mfn > start_mfn || end_mfn > top_mfn)
+ *uniform = 0;
curr_match = op.u.read_memtype.type;
- if (prev_match == 0xFF) {
+ if (prev_match == MTRR_TYPE_INVALID) {
prev_match = curr_match;
continue;
}
@@ -318,7 +345,7 @@ u8 mtrr_type_lookup(u64 start, u64 end)
return MTRR_TYPE_WRBACK;
}
- if (prev_match != 0xFF)
+ if (prev_match != MTRR_TYPE_INVALID)
return prev_match;
#if 0//todo
@@ -369,6 +396,9 @@ void __init mtrr_bp_init(void)
rdmsrl(MSR_K8_TOP_MEM2, tom2);
tom2 &= 0xffffff8000000ULL;
}
+
+ if (!mtrr_enabled())
+ pr_info("MTRR: Disabled\n");
}
void mtrr_ap_init(void)
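Callers of the reworked mtrr_type_lookup() now get MTRR_TYPE_INVALID when
MTRRs are unavailable, plus a 'uniform' flag saying whether a single entry
(or the default type) covers the whole range. A caller-side sketch of that
contract; the constants mirror the kernel's values, but the lookup body is a
stub:

#include <stdio.h>

#define MTRR_TYPE_WRBACK	6
#define MTRR_TYPE_INVALID	0xff

typedef unsigned char u8;
typedef unsigned long long u64;

/* Stub: pretend one write-back entry covers the whole range. */
static u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform)
{
	(void)start; (void)end;
	*uniform = 1;
	return MTRR_TYPE_WRBACK;
}

int main(void)
{
	u8 uniform;
	u8 type = mtrr_type_lookup(0x100000, 0x200000, &uniform);

	if (type == MTRR_TYPE_INVALID)
		printf("MTRRs disabled, fall back to the default type\n");
	else
		printf("type %u, %s coverage\n", type,
		       uniform ? "uniform" : "mixed");
	return 0;
}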
--- a/arch/x86/kernel/e820-xen.c
+++ b/arch/x86/kernel/e820-xen.c
@@ -172,6 +172,7 @@ static void __init e820_print_type(u32 t
case E820_UNUSABLE:
printk(KERN_CONT "unusable");
break;
+ case E820_PMEM:
case E820_PRAM:
printk(KERN_CONT "persistent (type %u)", type);
break;
@@ -1030,11 +1031,32 @@ static inline const char *e820_type_to_s
case E820_ACPI: return "ACPI Tables";
case E820_NVS: return "ACPI Non-volatile Storage";
case E820_UNUSABLE: return "Unusable memory";
- case E820_PRAM: return "Persistent RAM";
+ case E820_PRAM: return "Persistent Memory (legacy)";
+ case E820_PMEM: return "Persistent Memory";
default: return "reserved";
}
}
+static bool do_mark_busy(u32 type, struct resource *res)
+{
+ /* this is the legacy bios/dos rom-shadow + mmio region */
+ if (res->start < (1ULL<<20))
+ return true;
+
+ /*
+ * Treat persistent memory like device memory, i.e. reserve it
+ * for exclusive use by a driver.
+ */
+ switch (type) {
+ case E820_RESERVED:
+ case E820_PRAM:
+ case E820_PMEM:
+ return false;
+ default:
+ return true;
+ }
+}
+
#ifdef CONFIG_XEN
#define e820 machine_e820
#endif
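do_mark_busy() centralizes what used to be an inline test: everything below
1 MiB stays busy, while reserved and persistent-memory ranges are left
unclaimed so a driver can later take exclusive ownership. A standalone mirror
of the classifier (the E820 type values here are placeholders, not
necessarily the kernel's):

#include <stdbool.h>
#include <stdio.h>

enum { E820_RESERVED = 2, E820_PMEM = 7, E820_PRAM = 12 };

static bool mark_busy(unsigned int type, unsigned long long start)
{
	if (start < (1ULL << 20))
		return true;		/* legacy BIOS/DOS shadow + MMIO */

	switch (type) {
	case E820_RESERVED:
	case E820_PRAM:
	case E820_PMEM:
		return false;		/* leave for a driver to claim */
	default:
		return true;
	}
}

int main(void)
{
	printf("%d %d\n", mark_busy(E820_PMEM, 1ULL << 30),
	       mark_busy(E820_PMEM, 0x500));	/* prints: 0 1 */
	return 0;
}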
@@ -1068,9 +1090,7 @@ void __init e820_reserve_resources(void)
* pci device BAR resource and insert them later in
* pcibios_resource_survey()
*/
- if (((e820.map[i].type != E820_RESERVED) &&
- (e820.map[i].type != E820_PRAM)) ||
- res->start < (1ULL<<20)) {
+ if (do_mark_busy(e820.map[i].type, res)) {
res->flags |= IORESOURCE_BUSY;
insert_resource(&iomem_resource, res);
}
@@ -1252,7 +1272,8 @@ void __init memblock_find_dma_reserve(vo
nr_pages += end_pfn - start_pfn;
}
- for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) {
+ for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
+ NULL) {
start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
if (start_pfn < end_pfn)
--- a/arch/x86/kernel/early_printk-xen.c
+++ b/arch/x86/kernel/early_printk-xen.c
@@ -174,7 +174,9 @@ static __init void early_serial_init(cha
}
if (*s) {
- if (kstrtoul(s, 0, &baud) < 0 || baud == 0)
+ baud = simple_strtoull(s, &e, 0);
+
+ if (baud == 0 || s == e)
baud = DEFAULT_BAUD;
}
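The baud parsing change above is presumably about trailing mode characters:
a strict whole-string parser such as kstrtoul() rejects "115200n8" outright,
whereas simple_strtoull() consumes the numeric prefix and leaves the rest.
The userspace equivalent:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *s = "115200n8";
	char *e;
	unsigned long long baud = strtoull(s, &e, 0);

	if (baud == 0 || s == e)
		baud = 9600;		/* fall back to a default */
	printf("baud=%llu, rest=\"%s\"\n", baud, e);
	/* prints: baud=115200, rest="n8" */
	return 0;
}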
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -6,6 +6,8 @@
#include <linux/sched.h>
+static int x86_noxsave_setup(char *);
+
/*
* Initialize the TS bit in CR0 according to the style of context-switches
* we are using:
@@ -64,7 +66,7 @@ void fpu__init_cpu(void)
* Set the X86_FEATURE_FPU CPU-capability bit based on
* trying to execute an actual sequence of FPU instructions:
*/
-static void fpu__init_system_early_generic(struct cpuinfo_x86 *c)
+static void __init fpu__init_system_early_generic(struct cpuinfo_x86 *c)
{
unsigned long cr0;
u16 fsw, fcw;
@@ -90,6 +92,10 @@ static void fpu__init_system_early_gener
asm volatile("hlt");
}
#endif
+#ifdef CONFIG_XEN
+ if (!cpu_has_xsave)
+ x86_noxsave_setup("");
+#endif
}
/*
--- a/arch/x86/kernel/head-xen.c
+++ b/arch/x86/kernel/head-xen.c
@@ -196,7 +196,7 @@ void __init xen_arch_setup(void)
#ifdef CONFIG_X86_64
static const struct callback_register __initconst syscall = {
.type = CALLBACKTYPE_syscall,
- .address = CALLBACK_ADDR(system_call)
+ .address = CALLBACK_ADDR(entry_SYSCALL_64)
};
#endif
static const struct callback_register __initconst nmi_cb = {
--- a/arch/x86/kernel/head64-xen.c
+++ b/arch/x86/kernel/head64-xen.c
@@ -178,11 +178,12 @@ asmlinkage __visible void __init x86_64_
/* Kill off the identity-map trampoline */
reset_early_page_tables();
- kasan_map_early_shadow(early_level4_pgt);
-
- /* clear bss before set_intr_gate with early_idt_handler */
clear_bss();
+ clear_page(init_level4_pgt);
+
+ kasan_early_init();
+
for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
set_intr_gate(i, early_idt_handler_array[i]);
load_idt((const struct desc_ptr *)&idt_descr);
@@ -196,15 +197,12 @@ asmlinkage __visible void __init x86_64_
#endif
#ifndef CONFIG_XEN
- clear_page(init_level4_pgt);
/* set init_level4_pgt kernel high mapping*/
init_level4_pgt[511] = early_level4_pgt[511];
- kasan_map_early_shadow(init_level4_pgt);
#else
xen_switch_pt();
#endif
-
x86_64_start_reservations(real_mode_data);
}
--- a/arch/x86/kernel/head_32-xen.S
+++ b/arch/x86/kernel/head_32-xen.S
@@ -12,7 +12,6 @@
#include <asm/thread_info.h>
#include <asm/asm-offsets.h>
#include <asm/boot.h>
-#include <asm/dwarf2.h>
#include <asm/percpu.h>
#include <xen/interface/xen.h>
#include <xen/interface/arch-x86/xen-mca.h>
@@ -119,15 +118,7 @@ ENTRY(startup_32)
.balign PAGE_SIZE
#endif
ENTRY(hypercall_page)
- CFI_STARTPROC
- .skip __HYPERVISOR_iret * 32
- CFI_REMEMBER_STATE
- .skip 1 /* push %eax */
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET eax,0
- CFI_RESTORE_STATE
- .balign 0x1000,0
- CFI_ENDPROC
+ .fill PAGE_SIZE, 1, 0xcc
#define HYPERCALL(n) \
.equ HYPERVISOR_##n, hypercall_page + __HYPERVISOR_##n * 32; \
--- a/arch/x86/kernel/head_64-xen.S
+++ b/arch/x86/kernel/head_64-xen.S
@@ -19,7 +19,6 @@
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/cache.h>
-#include <asm/dwarf2.h>
#include <asm/percpu.h>
#include <xen/interface/xen.h>
#include <xen/interface/arch-x86/xen-mca.h>
@@ -95,39 +94,7 @@ NEXT_PAGE(hypercall_page)
#if CONFIG_XEN_COMPAT <= 0x030002
phys_hypercall_page = . - .head.text
#endif
- CFI_STARTPROC
- i = 0
- .rept 0x1000 / 0x20
- .skip 1 /* push %rcx */
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rcx,0
- .skip 2 /* push %r11 */
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET r11,0
- .if i == __HYPERVISOR_iret
- .skip 1 /* push %rax */
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rax,0
- .endif
- .skip 5 /* mov $#,%eax */
- .skip 2 /* syscall */
- .if i == __HYPERVISOR_iret
- CFI_ADJUST_CFA_OFFSET -3*8
- CFI_SAME_VALUE rax
- CFI_SAME_VALUE r11
- CFI_SAME_VALUE rcx
- .else
- .skip 2 /* pop %r11 */
- CFI_ADJUST_CFA_OFFSET -8
- CFI_RESTORE r11
- .skip 1 /* pop %rcx */
- CFI_ADJUST_CFA_OFFSET -8
- CFI_RESTORE rcx
- .endif
- .balign 0x20,0 /* ret */
- i = i + 1
- .endr
- CFI_ENDPROC
+ .fill PAGE_SIZE, 1, 0xcc
#define HYPERCALL(n) \
.equ HYPERVISOR_##n, hypercall_page + __HYPERVISOR_##n * 32; \
--- a/arch/x86/kernel/irq-xen.c
+++ b/arch/x86/kernel/irq-xen.c
@@ -22,6 +22,12 @@
#define CREATE_TRACE_POINTS
#include <asm/trace/irq_vectors.h>
+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+EXPORT_PER_CPU_SYMBOL(irq_stat);
+
+DEFINE_PER_CPU(struct pt_regs *, irq_regs);
+EXPORT_PER_CPU_SYMBOL(irq_regs);
+
atomic_t irq_err_count;
#ifndef CONFIG_XEN
@@ -130,6 +136,12 @@ int arch_show_interrupts(struct seq_file
seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
seq_puts(p, " Threshold APIC interrupts\n");
#endif
+#ifdef CONFIG_X86_MCE_AMD
+ seq_printf(p, "%*s: ", prec, "DFR");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_deferred_error_count);
+ seq_puts(p, " Deferred Error APIC interrupts\n");
+#endif
#ifdef CONFIG_X86_MCE
seq_printf(p, "%*s: ", prec, "MCE");
for_each_online_cpu(j)
@@ -150,6 +162,18 @@ int arch_show_interrupts(struct seq_file
#if defined(CONFIG_X86_IO_APIC)
seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
#endif
+#ifdef CONFIG_HAVE_KVM
+ seq_printf(p, "%*s: ", prec, "PIN");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis);
+ seq_puts(p, " Posted-interrupt notification event\n");
+
+ seq_printf(p, "%*s: ", prec, "PIW");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->kvm_posted_intr_wakeup_ipis);
+ seq_puts(p, " Posted-interrupt wakeup event\n");
+#endif
return 0;
}
@@ -211,8 +235,7 @@ __visible unsigned int __irq_entry do_IR
unsigned vector = ~regs->orig_ax;
unsigned irq;
- irq_enter();
- exit_idle();
+ entering_irq();
irq = __this_cpu_read(vector_irq[vector]);
@@ -228,7 +251,7 @@ __visible unsigned int __irq_entry do_IR
}
}
- irq_exit();
+ exiting_irq();
set_irq_regs(old_regs);
return 1;
@@ -256,6 +279,18 @@ __visible void smp_x86_platform_ipi(stru
}
#ifdef CONFIG_HAVE_KVM
+static void dummy_handler(void) {}
+static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler;
+
+void kvm_set_posted_intr_wakeup_handler(void (*handler)(void))
+{
+ if (handler)
+ kvm_posted_intr_wakeup_handler = handler;
+ else
+ kvm_posted_intr_wakeup_handler = dummy_handler;
+}
+EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler);
+
/*
* Handler for POSTED_INTERRUPT_VECTOR.
*/
@@ -263,16 +298,23 @@ __visible void smp_kvm_posted_intr_ipi(s
{
struct pt_regs *old_regs = set_irq_regs(regs);
- ack_APIC_irq();
-
- irq_enter();
-
- exit_idle();
-
+ entering_ack_irq();
inc_irq_stat(kvm_posted_intr_ipis);
+ exiting_irq();
+ set_irq_regs(old_regs);
+}
- irq_exit();
+/*
+ * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
+ */
+__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ entering_ack_irq();
+ inc_irq_stat(kvm_posted_intr_wakeup_ipis);
+ kvm_posted_intr_wakeup_handler();
+ exiting_irq();
set_irq_regs(old_regs);
}
#endif
@@ -327,14 +369,22 @@ int check_irq_vectors_for_cpu_disable(vo
if (!desc)
continue;
+ /*
+ * Protect against concurrent action removal,
+ * affinity changes etc.
+ */
+ raw_spin_lock(&desc->lock);
data = irq_desc_get_irq_data(desc);
cpumask_copy(&affinity_new, data->affinity);
cpumask_clear_cpu(this_cpu, &affinity_new);
/* Do not count inactive or per-cpu irqs. */
- if (!irq_has_action(irq) || irqd_is_per_cpu(data))
+ if (!irq_has_action(irq) || irqd_is_per_cpu(data)) {
+ raw_spin_unlock(&desc->lock);
continue;
+ }
+ raw_spin_unlock(&desc->lock);
/*
* A single irq may be mapped to multiple
* cpu's vector_irq[] (for example IOAPIC cluster
@@ -365,6 +415,9 @@ int check_irq_vectors_for_cpu_disable(vo
* vector. If the vector is marked in the used vectors
* bitmap or an irq is assigned to it, we don't count
* it as available.
+ *
+ * As this is an inaccurate snapshot anyway, we can do
+ * this w/o holding vector_lock.
*/
for (vector = FIRST_EXTERNAL_VECTOR;
vector < first_system_vector; vector++) {
@@ -462,15 +515,20 @@ void fixup_irqs(void)
*/
mdelay(1);
+ /*
+ * We can walk the vector array of this cpu without holding
+ * vector_lock because the cpu is already marked !online, so
+ * nothing else will touch it.
+ */
for_each_irq_desc(irq, desc) {
if (!__test_and_clear_bit(irq, irqs_used))
continue;
if (xen_test_irq_pending(irq)) {
desc = irq_to_desc(irq);
+ raw_spin_lock(&desc->lock);
data = irq_desc_get_irq_data(desc);
chip = irq_data_get_irq_chip(data);
- raw_spin_lock(&desc->lock);
if (chip->irq_retrigger)
chip->irq_retrigger(data);
raw_spin_unlock(&desc->lock);
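
[Editorial note: kvm_set_posted_intr_wakeup_handler() above installs a caller-supplied handler and falls back to a no-op dummy when unregistering, so the interrupt path can always call through the pointer without a NULL check. A minimal user-space sketch of that pattern, names illustrative:]

#include <stdio.h>

static void dummy_handler(void) { }
static void (*wakeup_handler)(void) = dummy_handler;

/* Register a handler, or restore the no-op default when handler is NULL. */
static void set_wakeup_handler(void (*handler)(void))
{
    wakeup_handler = handler ? handler : dummy_handler;
}

static void my_handler(void) { puts("wakeup"); }

int main(void)
{
    wakeup_handler();            /* safe even before registration */
    set_wakeup_handler(my_handler);
    wakeup_handler();
    set_wakeup_handler(NULL);    /* back to the no-op */
    wakeup_handler();
    return 0;
}
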
--- a/arch/x86/kernel/ldt-xen.c
+++ b/arch/x86/kernel/ldt-xen.c
@@ -12,6 +12,7 @@
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/smp.h>
+#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
@@ -20,85 +21,83 @@
#include <asm/mmu_context.h>
#include <asm/syscalls.h>
-#ifdef CONFIG_SMP
+/* context.lock is held for us, so we don't need any locking. */
static void flush_ldt(void *current_mm)
{
- if (current->active_mm == current_mm)
- load_LDT(&current->active_mm->context);
+ mm_context_t *pc;
+
+ if (current->active_mm != current_mm)
+ return;
+
+ pc = &current->active_mm->context;
+ set_ldt(pc->ldt->entries, pc->ldt->size);
}
-#endif
-static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
+/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
+static struct ldt_struct *alloc_ldt_struct(int size)
{
- void *oldldt, *newldt;
- int oldsize;
+ struct ldt_struct *new_ldt;
+ int alloc_size;
- if (mincount <= pc->size)
- return 0;
- oldsize = pc->size;
- mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
- (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
- if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
- newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
- else
- newldt = (void *)__get_free_page(GFP_KERNEL);
+ if (size > LDT_ENTRIES)
+ return NULL;
- if (!newldt)
- return -ENOMEM;
+ new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL);
+ if (!new_ldt)
+ return NULL;
+
+ BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
+ alloc_size = size * LDT_ENTRY_SIZE;
+
+ /*
+ * Xen is very picky: it requires a page-aligned LDT that has no
+ * trailing nonzero bytes in any page that contains LDT descriptors.
+ * Keep it simple: zero the whole allocation and never allocate less
+ * than PAGE_SIZE.
+ */
+ if (alloc_size > PAGE_SIZE)
+ new_ldt->entries = vzalloc(alloc_size);
+ else
+ new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL);
- if (oldsize)
- memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
- oldldt = pc->ldt;
- memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
- (mincount - oldsize) * LDT_ENTRY_SIZE);
-
-#ifdef CONFIG_X86_64
- /* CHECKME: Do we really need this ? */
- wmb();
-#endif
- pc->ldt = newldt;
- wmb();
- pc->size = mincount;
- wmb();
-
- if (reload) {
-#ifdef CONFIG_SMP
- preempt_disable();
-#endif
- make_pages_readonly(newldt,
- (mincount * LDT_ENTRY_SIZE) / PAGE_SIZE,
- XENFEAT_writable_descriptor_tables);
- load_LDT(pc);
-#ifdef CONFIG_SMP
- if (!cpumask_equal(mm_cpumask(current->mm),
- cpumask_of(smp_processor_id())))
- smp_call_function(flush_ldt, current->mm, 1);
- preempt_enable();
-#endif
+ if (!new_ldt->entries) {
+ kfree(new_ldt);
+ return NULL;
}
- if (oldsize) {
- make_pages_writable(oldldt,
- (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
- XENFEAT_writable_descriptor_tables);
- if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
- vfree(oldldt);
- else
- put_page(virt_to_page(oldldt));
- }
- return 0;
-}
-
-static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
-{
- int err = alloc_ldt(new, old->size, 0);
-
- if (err < 0)
- return err;
- memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE);
- make_pages_readonly(new->ldt,
- (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
+ new_ldt->size = size;
+ return new_ldt;
+}
+
+/* After calling this, the LDT is immutable. */
+static void finalize_ldt_struct(struct ldt_struct *ldt)
+{
+ make_pages_readonly(ldt->entries, PFN_DOWN(ldt->size * LDT_ENTRY_SIZE),
+ XENFEAT_writable_descriptor_tables);
+}
+
+/* context.lock is held */
+static void install_ldt(struct mm_struct *current_mm,
+ struct ldt_struct *ldt)
+{
+ /* Synchronizes with lockless_dereference in load_mm_ldt. */
+ smp_store_release(&current_mm->context.ldt, ldt);
+
+ /* Activate the LDT for all CPUs using current_mm. */
+ on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true);
+}
+
+static void free_ldt_struct(struct ldt_struct *ldt)
+{
+ if (likely(!ldt))
+ return;
+
+ make_pages_writable(ldt->entries, PFN_DOWN(ldt->size * LDT_ENTRY_SIZE),
XENFEAT_writable_descriptor_tables);
- return 0;
+ if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+ vfree(ldt->entries);
+ else
+ free_page((unsigned long)ldt->entries);
+ kfree(ldt);
}
/*
@@ -107,19 +106,35 @@ static inline int copy_ldt(mm_context_t
*/
int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
{
+ struct ldt_struct *new_ldt;
struct mm_struct *old_mm;
int retval = 0;
memset(&mm->context, 0, sizeof(mm->context));
mutex_init(&mm->context.lock);
old_mm = current->mm;
- if (old_mm)
- mm->context.vdso = old_mm->context.vdso;
- if (old_mm && old_mm->context.size > 0) {
- mutex_lock(&old_mm->context.lock);
- retval = copy_ldt(&mm->context, &old_mm->context);
- mutex_unlock(&old_mm->context.lock);
+ if (!old_mm)
+ return 0;
+ mm->context.vdso = old_mm->context.vdso;
+
+ mutex_lock(&old_mm->context.lock);
+ if (!old_mm->context.ldt)
+ goto out_unlock;
+
+ new_ldt = alloc_ldt_struct(old_mm->context.ldt->size);
+ if (!new_ldt) {
+ retval = -ENOMEM;
+ goto out_unlock;
}
+
+ memcpy(new_ldt->entries, old_mm->context.ldt->entries,
+ new_ldt->size * LDT_ENTRY_SIZE);
+ finalize_ldt_struct(new_ldt);
+
+ mm->context.ldt = new_ldt;
+
+out_unlock:
+ mutex_unlock(&old_mm->context.lock);
return retval;
}
@@ -130,53 +145,47 @@ int init_new_context(struct task_struct
*/
void destroy_context(struct mm_struct *mm)
{
- if (mm->context.size) {
- /* CHECKME: Can this ever happen ? */
- if (mm == current->active_mm)
- clear_LDT();
- make_pages_writable(mm->context.ldt,
- (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
- XENFEAT_writable_descriptor_tables);
- if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
- vfree(mm->context.ldt);
- else
- put_page(virt_to_page(mm->context.ldt));
- mm->context.size = 0;
- }
+ free_ldt_struct(mm->context.ldt);
+ mm->context.ldt = NULL;
}
static int read_ldt(void __user *ptr, unsigned long bytecount)
{
- int err;
+ int retval;
unsigned long size;
struct mm_struct *mm = current->mm;
- if (!mm->context.size)
- return 0;
+ mutex_lock(&mm->context.lock);
+
+ if (!mm->context.ldt) {
+ retval = 0;
+ goto out_unlock;
+ }
+
if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
- mutex_lock(&mm->context.lock);
- size = mm->context.size * LDT_ENTRY_SIZE;
+ size = mm->context.ldt->size * LDT_ENTRY_SIZE;
if (size > bytecount)
size = bytecount;
- err = 0;
- if (copy_to_user(ptr, mm->context.ldt, size))
- err = -EFAULT;
- mutex_unlock(&mm->context.lock);
- if (err < 0)
- goto error_return;
+ if (copy_to_user(ptr, mm->context.ldt->entries, size)) {
+ retval = -EFAULT;
+ goto out_unlock;
+ }
+
if (size != bytecount) {
- /* zero-fill the rest */
- if (clear_user(ptr + size, bytecount - size) != 0) {
- err = -EFAULT;
- goto error_return;
+ /* Zero-fill the rest and pretend we read bytecount bytes. */
+ if (clear_user(ptr + size, bytecount - size)) {
+ retval = -EFAULT;
+ goto out_unlock;
}
}
- return bytecount;
-error_return:
- return err;
+ retval = bytecount;
+
+out_unlock:
+ mutex_unlock(&mm->context.lock);
+ return retval;
}
static int read_default_ldt(void __user *ptr, unsigned long bytecount)
@@ -200,6 +209,8 @@ static int write_ldt(void __user *ptr, u
struct desc_struct ldt;
int error;
struct user_desc ldt_info;
+ int oldsize, newsize;
+ struct ldt_struct *new_ldt, *old_ldt;
error = -EINVAL;
if (bytecount != sizeof(ldt_info))
@@ -218,34 +229,40 @@ static int write_ldt(void __user *ptr, u
goto out;
}
- mutex_lock(&mm->context.lock);
- if (ldt_info.entry_number >= mm->context.size) {
- error = alloc_ldt(¤t->mm->context,
- ldt_info.entry_number + 1, 1);
- if (error < 0)
- goto out_unlock;
- }
-
- /* Allow LDTs to be cleared by the user. */
- if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
- if (oldmode || LDT_empty(&ldt_info)) {
- memset(&ldt, 0, sizeof(ldt));
- goto install;
+ if ((oldmode && !ldt_info.base_addr && !ldt_info.limit) ||
+ LDT_empty(&ldt_info)) {
+ /* The user wants to clear the entry. */
+ memset(&ldt, 0, sizeof(ldt));
+ } else {
+ if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
+ error = -EINVAL;
+ goto out;
}
+
+ fill_ldt(&ldt, &ldt_info);
+ if (oldmode)
+ ldt.avl = 0;
}
- if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
- error = -EINVAL;
+ mutex_lock(&mm->context.lock);
+
+ old_ldt = mm->context.ldt;
+ oldsize = old_ldt ? old_ldt->size : 0;
+ newsize = max((int)(ldt_info.entry_number + 1), oldsize);
+
+ error = -ENOMEM;
+ new_ldt = alloc_ldt_struct(newsize);
+ if (!new_ldt)
goto out_unlock;
- }
- fill_ldt(&ldt, &ldt_info);
- if (oldmode)
- ldt.avl = 0;
-
- /* Install the new entry ... */
-install:
- error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
+ if (old_ldt)
+ memcpy(new_ldt->entries, old_ldt->entries, oldsize * LDT_ENTRY_SIZE);
+ new_ldt->entries[ldt_info.entry_number] = ldt;
+ finalize_ldt_struct(new_ldt);
+
+ install_ldt(mm, new_ldt);
+ free_ldt_struct(old_ldt);
+ error = 0;
out_unlock:
mutex_unlock(&mm->context.lock);
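
[Editorial note: the rewritten ldt-xen.c never mutates a live LDT. write_ldt() builds a fresh ldt_struct, copies the old entries, finalizes it (read-only for Xen), then publishes it with smp_store_release() so readers using lockless_dereference() only ever see fully initialized tables. A user-space sketch of that publish/consume shape with C11 atomics; names are illustrative and the immediate free() is a simplification, since the kernel defers teardown until all CPUs have reloaded:]

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct ldt_struct {
    int size;
    unsigned long entries[];
};

static _Atomic(struct ldt_struct *) current_ldt;

/* Writer: build a complete replacement, then publish with release
 * semantics; the old table is only reclaimed after the swap. */
static void install_ldt(struct ldt_struct *new_ldt)
{
    struct ldt_struct *old =
        atomic_exchange_explicit(&current_ldt, new_ldt, memory_order_release);
    free(old);  /* simplified; see note above */
}

/* Reader: acquire pairs with the writer's release, so *ldt is initialized. */
static int read_ldt_size(void)
{
    struct ldt_struct *ldt =
        atomic_load_explicit(&current_ldt, memory_order_acquire);
    return ldt ? ldt->size : 0;
}

int main(void)
{
    struct ldt_struct *l = calloc(1, sizeof(*l) + 4 * sizeof(unsigned long));
    l->size = 4;
    install_ldt(l);
    printf("size=%d\n", read_ldt_size());
    return 0;
}
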
--- a/arch/x86/kernel/mpparse-xen.c
+++ b/arch/x86/kernel/mpparse-xen.c
@@ -19,8 +19,8 @@
#include <linux/module.h>
#include <linux/smp.h>
#include <linux/pci.h>
-#include <linux/irqdomain.h>
+#include <asm/irqdomain.h>
#include <asm/mtrr.h>
#include <asm/mpspec.h>
#include <asm/pgalloc.h>
@@ -125,11 +125,6 @@ static void __init MP_bus_info(struct mp
pr_warn("Unknown bustype %s - ignoring\n", str);
}
-static struct irq_domain_ops mp_ioapic_irqdomain_ops = {
- .map = mp_irqdomain_map,
- .unmap = mp_irqdomain_unmap,
-};
-
static void __init MP_ioapic_info(struct mpc_ioapic *m)
{
struct ioapic_domain_cfg cfg = {
--- a/arch/x86/kernel/pci-dma-xen.c
+++ b/arch/x86/kernel/pci-dma-xen.c
@@ -198,6 +198,51 @@ void dma_generic_free_coherent(struct de
}
}
+void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
+ gfp_t gfp, struct dma_attrs *attrs)
+{
+ struct dma_map_ops *ops = get_dma_ops(dev);
+ void *memory;
+
+ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+
+ if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
+ return memory;
+
+ if (!dev)
+ dev = &x86_dma_fallback_dev;
+
+ if (!is_device_dma_capable(dev))
+ return NULL;
+
+ if (!ops->alloc)
+ return NULL;
+
+ memory = ops->alloc(dev, size, dma_handle,
+ dma_alloc_coherent_gfp_flags(dev, gfp), attrs);
+ debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
+
+ return memory;
+}
+EXPORT_SYMBOL(dma_alloc_attrs);
+
+void dma_free_attrs(struct device *dev, size_t size,
+ void *vaddr, dma_addr_t bus,
+ struct dma_attrs *attrs)
+{
+ struct dma_map_ops *ops = get_dma_ops(dev);
+
+ WARN_ON(irqs_disabled()); /* for portability */
+
+ if (dma_release_from_coherent(dev, get_order(size), vaddr))
+ return;
+
+ debug_dma_free_coherent(dev, size, vaddr, bus);
+ if (ops->free)
+ ops->free(dev, size, vaddr, bus, attrs);
+}
+EXPORT_SYMBOL(dma_free_attrs);
+
/*
* See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel
* parameter documentation.
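
[Editorial note: dma_alloc_attrs() above follows a fixed dispatch order, per-device coherent pool first, then the fallback device, then capability and ops checks, before finally calling ops->alloc. A condensed, compilable sketch of that decision chain; the stub helpers stand in for the kernel functions and are illustrative only:]

#include <stddef.h>
#include <stdbool.h>

struct device;
struct dma_ops { void *(*alloc)(struct device *dev, size_t size); };

/* Stubs standing in for the kernel helpers used above. */
static bool alloc_from_coherent(struct device *dev, size_t size, void **out)
{ (void)dev; (void)size; (void)out; return false; }
static bool device_dma_capable(struct device *dev) { (void)dev; return true; }
static struct dma_ops *get_ops(struct device *dev) { (void)dev; return NULL; }
static struct device *fallback_dev;

static void *dma_alloc_sketch(struct device *dev, size_t size)
{
    struct dma_ops *ops;
    void *memory;

    /* 1. A per-device coherent pool short-circuits everything else. */
    if (alloc_from_coherent(dev, size, &memory))
        return memory;

    /* 2. A NULL device falls back to the catch-all fallback device. */
    if (!dev)
        dev = fallback_dev;

    /* 3. Refuse devices that cannot do DMA and ops without an alloc hook. */
    if (!device_dma_capable(dev) || !(ops = get_ops(dev)) || !ops->alloc)
        return NULL;

    /* 4. Only now delegate to the bus/arch-specific allocator. */
    return ops->alloc(dev, size);
}

int main(void)
{
    return dma_alloc_sketch(NULL, 64) ? 1 : 0;
}
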
--- a/arch/x86/kernel/process-xen.c
+++ b/arch/x86/kernel/process-xen.c
@@ -25,8 +25,7 @@
#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/mwait.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
#include <asm/tlbflush.h>
@@ -60,6 +59,9 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(
#endif
};
EXPORT_PER_CPU_SYMBOL(cpu_tss);
+#elif defined(CONFIG_X86_64_XEN)
+__visible DEFINE_PER_CPU(unsigned long, cpu_sp0) = TOP_OF_INIT_STACK;
+EXPORT_PER_CPU_SYMBOL(cpu_sp0);
#endif
#ifdef CONFIG_X86_64
@@ -79,47 +81,15 @@ void idle_notifier_unregister(struct not
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
#endif
-struct kmem_cache *task_xstate_cachep;
-EXPORT_SYMBOL_GPL(task_xstate_cachep);
-
/*
* this gets called so that we can store lazy state into memory and copy the
* current task into the new thread.
*/
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
- *dst = *src;
-
- dst->thread.fpu_counter = 0;
- dst->thread.fpu.has_fpu = 0;
- dst->thread.fpu.state = NULL;
- task_disable_lazy_fpu_restore(dst);
- if (tsk_used_math(src)) {
- int err = fpu_alloc(&dst->thread.fpu);
- if (err)
- return err;
- fpu_copy(dst, src);
- }
- return 0;
-}
-
-void free_thread_xstate(struct task_struct *tsk)
-{
- fpu_free(&tsk->thread.fpu);
-}
-
-void arch_release_task_struct(struct task_struct *tsk)
-{
- free_thread_xstate(tsk);
-}
+ memcpy(dst, src, arch_task_struct_size);
-void arch_task_cache_init(void)
-{
- task_xstate_cachep =
- kmem_cache_create("task_xstate", xstate_size,
- __alignof__(union thread_xstate),
- SLAB_PANIC | SLAB_NOTRACK, NULL);
- setup_xstate_comp();
+ return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
}
/*
@@ -130,6 +100,7 @@ void exit_thread(void)
struct task_struct *me = current;
struct thread_struct *t = &me->thread;
unsigned long *bp = t->io_bitmap_ptr;
+ struct fpu *fpu = &t->fpu;
if (bp) {
struct physdev_set_iobitmap set_iobitmap;
@@ -146,7 +117,7 @@ void exit_thread(void)
kfree(bp);
}
- drop_fpu(me);
+ fpu__drop(fpu);
}
void flush_thread(void)
@@ -156,19 +127,7 @@ void flush_thread(void)
flush_ptrace_hw_breakpoint(tsk);
memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
- if (!use_eager_fpu()) {
- /* FPU state will be reallocated lazily at the first use. */
- drop_fpu(tsk);
- free_thread_xstate(tsk);
- } else {
- if (!tsk_used_math(tsk)) {
- /* kthread execs. TODO: cleanup this horror. */
- if (WARN_ON(init_fpu(tsk)))
- force_sig(SIGKILL, tsk);
- user_fpu_begin();
- }
- restore_init_xstate();
- }
+ fpu__clear(&tsk->thread.fpu);
}
static void hard_disable_TSC(void)
@@ -437,14 +396,14 @@ static int prefer_mwait_c1_over_halt(con
}
/*
- * MONITOR/MWAIT with no hints, used for default default C1 state.
- * This invokes MWAIT with interrutps enabled and no flags,
- * which is backwards compatible with the original MWAIT implementation.
+ * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT
+ * with interrupts enabled and no flags, which is backwards compatible with the
+ * original MWAIT implementation.
*/
-
static void mwait_idle(void)
{
if (!current_set_polling_and_test()) {
+ trace_cpu_idle_rcuidle(1, smp_processor_id());
if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
smp_mb(); /* quirk */
clflush((void *)&current_thread_info()->flags);
@@ -456,6 +415,7 @@ static void mwait_idle(void)
__sti_mwait(0, 0);
else
local_irq_enable();
+ trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
} else {
local_irq_enable();
}
--- a/arch/x86/kernel/process_32-xen.c
+++ b/arch/x86/kernel/process_32-xen.c
@@ -39,8 +39,7 @@
#include <asm/pgtable.h>
#include <asm/ldt.h>
#include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
#include <asm/desc.h>
#ifdef CONFIG_MATH_EMULATION
#include <asm/math_emu.h>
@@ -132,8 +131,8 @@ void release_thread(struct task_struct *
release_vm86_irqs(dead_task);
}
-int copy_thread(unsigned long clone_flags, unsigned long sp,
- unsigned long arg, struct task_struct *p)
+int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
+ unsigned long arg, struct task_struct *p, unsigned long tls)
{
struct pt_regs *childregs = task_pt_regs(p);
struct task_struct *tsk;
@@ -192,7 +191,7 @@ int copy_thread(unsigned long clone_flag
*/
if (clone_flags & CLONE_SETTLS)
err = do_set_thread_area(p, -1,
- (struct user_desc __user *)childregs->si, 0);
+ (struct user_desc __user *)tls, 0);
p->thread.iopl = current->thread.iopl;
@@ -250,12 +249,14 @@ __visible __notrace_funcgraph struct tas
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread,
- *next = &next_p->thread;
+ *next = &next_p->thread;
+ struct fpu *prev_fpu = &prev->fpu;
+ struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id(), cr0_ts;
#ifndef CONFIG_X86_NO_TSS
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
#endif
- fpu_switch_t fpu;
+ fpu_switch_t fpu_switch;
#if CONFIG_XEN_COMPAT > 0x030002
struct physdev_set_iopl iopl_op;
struct physdev_set_iobitmap iobmp_op;
@@ -268,7 +269,7 @@ __switch_to(struct task_struct *prev_p,
/* XEN NOTE: FS/GS saved in switch_mm(), not here. */
- fpu = xen_switch_fpu_prepare(prev_p, next_p, cpu, &mcl);
+ fpu_switch = xen_switch_fpu_prepare(prev_fpu, next_fpu, cpu, &mcl);
/*
* Reload sp0.
@@ -364,18 +365,15 @@ __switch_to(struct task_struct *prev_p,
* Leave lazy mode, flushing any hypercalls made here.
* This must be done before restoring TLS segments so
* the GDT and LDT are properly updated, and must be
- * done before math_state_restore, so the TS bit is up
+ * done before fpu__restore(), so the TS bit is up
* to date.
*/
arch_end_context_switch(next_p);
/*
- * Reload kernel_stack and current_top_of_stack. This changes
+ * Reload cpu_current_top_of_stack. This changes
* current_thread_info().
*/
- this_cpu_write(kernel_stack,
- (unsigned long)task_stack_page(next_p) +
- THREAD_SIZE);
this_cpu_write(cpu_current_top_of_stack,
(unsigned long)task_stack_page(next_p) +
THREAD_SIZE);
@@ -386,7 +384,7 @@ __switch_to(struct task_struct *prev_p,
if (prev->gs | next->gs)
lazy_load_gs(next->gs);
- switch_fpu_finish(next_p, fpu);
+ switch_fpu_finish(next_fpu, fpu_switch);
this_cpu_write(current_task, next_p);
--- a/arch/x86/kernel/process_64-xen.c
+++ b/arch/x86/kernel/process_64-xen.c
@@ -41,8 +41,7 @@
#include <asm/pgtable.h>
#include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <xen/interface/physdev.h>
@@ -131,11 +130,11 @@ EXPORT_SYMBOL(xen_load_gs_index);
void release_thread(struct task_struct *dead_task)
{
if (dead_task->mm) {
- if (dead_task->mm->context.size) {
+ if (dead_task->mm->context.ldt) {
pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
dead_task->comm,
dead_task->mm->context.ldt,
- dead_task->mm->context.size);
+ dead_task->mm->context.ldt->size);
BUG();
}
}
@@ -160,8 +159,8 @@ static inline u32 read_32bit_tls(struct
return get_desc_base(&t->thread.tls_array[tls]);
}
-int copy_thread(unsigned long clone_flags, unsigned long sp,
- unsigned long arg, struct task_struct *p)
+int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
+ unsigned long arg, struct task_struct *p, unsigned long tls)
{
int err;
struct pt_regs *childregs;
@@ -217,10 +216,10 @@ int copy_thread(unsigned long clone_flag
#ifdef CONFIG_IA32_EMULATION
if (is_ia32_task())
err = do_set_thread_area(p, -1,
- (struct user_desc __user *)childregs->si, 0);
+ (struct user_desc __user *)tls, 0);
else
#endif
- err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
+ err = do_arch_prctl(p, ARCH_SET_FS, tls);
if (err)
goto out;
}
@@ -285,11 +284,13 @@ __switch_to(struct task_struct *prev_p,
{
struct thread_struct *prev = &prev_p->thread;
struct thread_struct *next = &next_p->thread;
+ struct fpu *prev_fpu = &prev->fpu;
+ struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id(), cr0_ts;
#ifndef CONFIG_X86_NO_TSS
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
#endif
- fpu_switch_t fpu;
+ fpu_switch_t fpu_switch;
#if CONFIG_XEN_COMPAT > 0x030002
struct physdev_set_iopl iopl_op;
struct physdev_set_iobitmap iobmp_op;
@@ -300,7 +301,7 @@ __switch_to(struct task_struct *prev_p,
#endif
multicall_entry_t _mcl[8], *mcl = _mcl;
- fpu = xen_switch_fpu_prepare(prev_p, next_p, cpu, &mcl);
+ fpu_switch = xen_switch_fpu_prepare(prev_fpu, next_fpu, cpu, &mcl);
/* Reload sp0. This is load_sp0(tss, next) with a multicall. */
mcl->op = __HYPERVISOR_stack_switch;
@@ -399,7 +400,7 @@ __switch_to(struct task_struct *prev_p,
* Leave lazy mode, flushing any hypercalls made here. This
* must be done after loading TLS entries in the GDT but before
* loading segments that might reference them, and and it must
- * be done before math_state_restore, so the TS bit is up to
+ * be done before fpu__restore(), so the TS bit is up to
* date.
*/
arch_end_context_switch(next_p);
@@ -451,7 +452,7 @@ __switch_to(struct task_struct *prev_p,
if (next->gs)
WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs));
- switch_fpu_finish(next_p, fpu);
+ switch_fpu_finish(next_fpu, fpu_switch);
/*
* Switch the PDA context.
@@ -467,7 +468,7 @@ __switch_to(struct task_struct *prev_p,
this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
/* This changes current_thread_info(). */
- this_cpu_write(kernel_stack,
+ this_cpu_write(cpu_sp0,
(unsigned long)task_stack_page(next_p) + THREAD_SIZE);
/*
--- a/arch/x86/kernel/setup-xen.c
+++ b/arch/x86/kernel/setup-xen.c
@@ -528,19 +528,18 @@ static void __init e820_reserve_setup_da
#ifndef CONFIG_XEN
struct setup_data *data;
u64 pa_data;
- int found = 0;
pa_data = boot_params.hdr.setup_data;
+ if (!pa_data)
+ return;
+
while (pa_data) {
data = early_memremap(pa_data, sizeof(*data));
e820_update_range(pa_data, sizeof(*data)+data->len,
E820_RAM, E820_RESERVED_KERN);
- found = 1;
pa_data = data->next;
early_memunmap(data, sizeof(*data));
}
- if (!found)
- return;
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
memcpy(&e820_saved, &e820, sizeof(struct e820map));
@@ -601,12 +600,14 @@ static void __init reserve_crashkernel_l
if (ret != 0) {
/*
* two parts from lib/swiotlb.c:
- * swiotlb size: user specified with swiotlb= or default.
- * swiotlb overflow buffer: now is hardcoded to 32k.
- * We round it to 8M for other buffers that
- * may need to stay low too.
+ * -swiotlb size: user-specified with swiotlb= or default.
+ *
+ * -swiotlb overflow buffer: now hardcoded to 32k. We round it
+ * to 8M for other buffers that may need to stay low too. Also
+ * make sure we allocate enough extra low memory so that we
+ * don't run out of DMA buffers for 32-bit devices.
*/
- low_size = swiotlb_size_or_default() + (8UL<<20);
+ low_size = max(swiotlb_size_or_default() + (8UL<<20), 256UL<<20);
auto_set = true;
} else {
/* passed with crashkernel=0,low ? */
@@ -912,7 +913,7 @@ dump_kernel_offset(struct notifier_block
{
if (kaslr_enabled()) {
pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n",
- (unsigned long)&_text - __START_KERNEL,
+ kaslr_offset(),
__START_KERNEL,
__START_KERNEL_map,
MODULES_VADDR-1);
@@ -1248,6 +1249,9 @@ void __init setup_arch(char **cmdline_p)
memblock_set_current_limit(ISA_END_ADDRESS);
memblock_x86_fill();
+ if (efi_enabled(EFI_BOOT))
+ efi_find_mirror();
+
/*
* The EFI specification says that boot service code won't be called
* after ExitBootServices(). This is, in fact, a lie.
@@ -1458,8 +1462,7 @@ void __init setup_arch(char **cmdline_p)
#ifndef CONFIG_XEN
init_apic_mappings();
- if (x86_io_apic_ops.init)
- x86_io_apic_ops.init();
+ io_apic_init_mappings();
kvm_guest_init();
--- a/arch/x86/kernel/traps-xen.c
+++ b/arch/x86/kernel/traps-xen.c
@@ -54,12 +54,13 @@
#include <asm/ftrace.h>
#include <asm/traps.h>
#include <asm/desc.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
#include <asm/mce.h>
#include <asm/fixmap.h>
#include <asm/mach_traps.h>
#include <asm/alternative.h>
+#include <asm/fpu/xstate.h>
+#include <asm/trace/mpx.h>
#include <asm/mpx.h>
#ifdef CONFIG_X86_64
@@ -74,8 +75,7 @@ gate_desc debug_idt_table[NR_VECTORS] __
#else
#include <asm/processor-flags.h>
#include <asm/setup.h>
-
-asmlinkage int system_call(void);
+#include <asm/proto.h>
#endif
#ifndef CONFIG_X86_NO_IDT
@@ -377,10 +377,8 @@ dotraplinkage void do_double_fault(struc
dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
{
- struct task_struct *tsk = current;
- struct xsave_struct *xsave_buf;
enum ctx_state prev_state;
- struct bndcsr *bndcsr;
+ const struct bndcsr *bndcsr;
siginfo_t *info;
prev_state = exception_enter();
@@ -399,15 +397,15 @@ dotraplinkage void do_bounds(struct pt_r
/*
* We need to look at BNDSTATUS to resolve this exception.
- * It is not directly accessible, though, so we need to
- * do an xsave and then pull it out of the xsave buffer.
+ * A NULL here might mean that it is in its 'init state',
+ * which is all zeros which indicates MPX was not
+ * responsible for the exception.
*/
- fpu_save_init(&tsk->thread.fpu);
- xsave_buf = &(tsk->thread.fpu.state->xsave);
- bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR);
+ bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR);
if (!bndcsr)
goto exit_trap;
+ trace_bounds_exception_mpx(bndcsr);
/*
* The error code field of the BNDSTATUS register communicates status
* information of a bound range exception #BR or operation involving
@@ -415,11 +413,11 @@ dotraplinkage void do_bounds(struct pt_r
*/
switch (bndcsr->bndstatus & MPX_BNDSTA_ERROR_CODE) {
case 2: /* Bound directory has invalid entry. */
- if (mpx_handle_bd_fault(xsave_buf))
+ if (mpx_handle_bd_fault())
goto exit_trap;
break; /* Success, it was handled */
case 1: /* Bound violation. */
- info = mpx_generate_siginfo(regs, xsave_buf);
+ info = mpx_generate_siginfo(regs);
if (IS_ERR(info)) {
/*
* We failed to decode the MPX instruction. Act as if
@@ -715,8 +713,8 @@ NOKPROBE_SYMBOL(do_debug);
static void math_error(struct pt_regs *regs, int error_code, int trapnr)
{
struct task_struct *task = current;
+ struct fpu *fpu = &task->thread.fpu;
siginfo_t info;
- unsigned short err;
char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
"simd exception";
@@ -724,8 +722,7 @@ static void math_error(struct pt_regs *r
return;
conditional_sti(regs);
- if (!user_mode(regs))
- {
+ if (!user_mode(regs)) {
if (!fixup_exception(regs)) {
task->thread.error_code = error_code;
task->thread.trap_nr = trapnr;
@@ -737,62 +734,20 @@ static void math_error(struct pt_regs *r
/*
* Save the info for the exception handler and clear the error.
*/
- unlazy_fpu(task);
- task->thread.trap_nr = trapnr;
+ fpu__save(fpu);
+
+ task->thread.trap_nr = trapnr;
task->thread.error_code = error_code;
- info.si_signo = SIGFPE;
- info.si_errno = 0;
- info.si_addr = (void __user *)uprobe_get_trap_addr(regs);
- if (trapnr == X86_TRAP_MF) {
- unsigned short cwd, swd;
- /*
- * (~cwd & swd) will mask out exceptions that are not set to unmasked
- * status. 0x3f is the exception bits in these regs, 0x200 is the
- * C1 reg you need in case of a stack fault, 0x040 is the stack
- * fault bit. We should only be taking one exception at a time,
- * so if this combination doesn't produce any single exception,
- * then we have a bad program that isn't synchronizing its FPU usage
- * and it will suffer the consequences since we won't be able to
- * fully reproduce the context of the exception
- */
- cwd = get_fpu_cwd(task);
- swd = get_fpu_swd(task);
+ info.si_signo = SIGFPE;
+ info.si_errno = 0;
+ info.si_addr = (void __user *)uprobe_get_trap_addr(regs);
- err = swd & ~cwd;
- } else {
- /*
- * The SIMD FPU exceptions are handled a little differently, as there
- * is only a single status/control register. Thus, to determine which
- * unmasked exception was caught we must mask the exception mask bits
- * at 0x1f80, and then use these to mask the exception bits at 0x3f.
- */
- unsigned short mxcsr = get_fpu_mxcsr(task);
- err = ~(mxcsr >> 7) & mxcsr;
- }
+ info.si_code = fpu__exception_code(fpu, trapnr);
- if (err & 0x001) { /* Invalid op */
- /*
- * swd & 0x240 == 0x040: Stack Underflow
- * swd & 0x240 == 0x240: Stack Overflow
- * User must clear the SF bit (0x40) if set
- */
- info.si_code = FPE_FLTINV;
- } else if (err & 0x004) { /* Divide by Zero */
- info.si_code = FPE_FLTDIV;
- } else if (err & 0x008) { /* Overflow */
- info.si_code = FPE_FLTOVF;
- } else if (err & 0x012) { /* Denormal, Underflow */
- info.si_code = FPE_FLTUND;
- } else if (err & 0x020) { /* Precision */
- info.si_code = FPE_FLTRES;
- } else {
- /*
- * If we're using IRQ 13, or supposedly even some trap
- * X86_TRAP_MF implementations, it's possible
- * we get a spurious trap, which is not an error.
- */
+ /* Retry when we get spurious exceptions: */
+ if (!info.si_code)
return;
- }
+
force_sig_info(SIGFPE, &info, task);
}
@@ -820,70 +775,9 @@ dotraplinkage void
do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
{
conditional_sti(regs);
-#if 0
- /* No need to warn about this any longer. */
- pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
-#endif
-}
-
-asmlinkage __visible void __attribute__((weak)) smp_thermal_interrupt(void)
-{
-}
-
-asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void)
-{
}
#endif /* CONFIG_XEN */
-/*
- * 'math_state_restore()' saves the current math information in the
- * old math state array, and gets the new ones from the current task
- *
- * Careful.. There are problems with IBM-designed IRQ13 behaviour.
- * Don't touch unless you *really* know how it works.
- *
- * Must be called with kernel preemption disabled (eg with local
- * local interrupts as in the case of do_device_not_available).
- */
-static void _math_state_restore(void)
-{
- struct task_struct *tsk = current;
-
- if (!tsk_used_math(tsk)) {
- stts();
- local_irq_enable();
- /*
- * does a slab alloc which can sleep
- */
- if (init_fpu(tsk)) {
- /*
- * ran out of memory!
- */
- do_group_exit(SIGKILL);
- return;
- }
- local_irq_disable();
- clts();
- }
-
- /* Avoid __kernel_fpu_begin() right after __thread_fpu_begin() */
- kernel_fpu_disable();
- xen_thread_fpu_begin(tsk, NULL);
- if (unlikely(restore_fpu_checking(tsk))) {
- fpu_reset_state(tsk);
- force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
- } else {
- tsk->thread.fpu_counter++;
- }
- kernel_fpu_enable();
-}
-
-void math_state_restore(void)
-{
- clts();
- _math_state_restore();
-}
-
dotraplinkage void
do_device_not_available(struct pt_regs *regs, long error_code)
{
@@ -906,7 +800,7 @@ do_device_not_available(struct pt_regs *
#endif
/* NB. 'clts' is done for us by Xen during virtual trap. */
__this_cpu_and(xen_x86_cr0, ~X86_CR0_TS);
- _math_state_restore(); /* interrupts still off */
+ fpu__restore(&current->thread.fpu); /* interrupts still off */
#ifdef CONFIG_X86_32
conditional_sti(regs);
#endif
@@ -978,9 +872,9 @@ static const trap_info_t trap_table[] =
{ X86_TRAP_XF, 0|X, __KERNEL_CS, (unsigned long)simd_coprocessor_error },
#ifdef CONFIG_X86_32
{ X86_TRAP_SPURIOUS, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment },
- { SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)system_call },
+ { IA32_SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)entry_INT80_32 },
#elif defined(CONFIG_IA32_EMULATION)
- { IA32_SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)ia32_syscall },
+ { IA32_SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)entry_INT80_compat },
#endif
{ }
};
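
[Editorial note: the large deletion in math_error() above moves the si_code derivation into fpu__exception_code(). The removed logic is worth keeping in mind when reading the new helper: for #MF it masks the status word with the inverted control word, for #XF it masks MXCSR against its own mask bits, then maps the lowest set exception to an FPE_* code. A standalone restatement using the same bit layout as the deleted code; the FPE_* values are illustrative stand-ins for the signal.h codes:]

#include <stdio.h>

enum { FPE_NONE, FPE_FLTINV, FPE_FLTDIV, FPE_FLTOVF, FPE_FLTUND, FPE_FLTRES };

/* Map unmasked-exception bits (layout shared by the x87 status word and the
 * low six MXCSR bits) to a signal code, as the deleted code did. */
static int fpe_code(unsigned short err)
{
    if (err & 0x001) return FPE_FLTINV;  /* invalid op (incl. stack fault) */
    if (err & 0x004) return FPE_FLTDIV;  /* divide by zero */
    if (err & 0x008) return FPE_FLTOVF;  /* overflow */
    if (err & 0x012) return FPE_FLTUND;  /* denormal or underflow */
    if (err & 0x020) return FPE_FLTRES;  /* precision */
    return FPE_NONE;                     /* spurious: the caller just returns */
}

/* #MF: mask the status word with the inverted control word. */
static int x87_code(unsigned short cwd, unsigned short swd)
{
    return fpe_code(swd & ~cwd);
}

/* #XF: MXCSR keeps mask bits at 0x1f80 and status bits at 0x3f. */
static int simd_code(unsigned short mxcsr)
{
    return fpe_code(~(mxcsr >> 7) & mxcsr);
}

int main(void)
{
    printf("%d %d\n", x87_code(0x037b, 0x0004), simd_code(0x1d84)); /* 2 2 */
    return 0;
}
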
--- a/arch/x86/lib/cmpxchg16b_emu-xen.S
+++ b/arch/x86/lib/cmpxchg16b_emu-xen.S
@@ -6,7 +6,6 @@
*
*/
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
#include <asm/irqflags.h>
#include <asm/percpu.h>
#include <xen/interface/xen.h>
@@ -23,7 +22,6 @@
* %al : Operation successful
*/
ENTRY(this_cpu_cmpxchg16b_emu)
- CFI_STARTPROC
#
# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not
@@ -34,8 +32,7 @@ ENTRY(this_cpu_cmpxchg16b_emu)
# *atomic* on a single cpu (as provided by the this_cpu_xx class of
# macros).
#
- pushq_cfi %rbp
- CFI_REL_OFFSET rbp, 0
+ pushq %rbp
#ifndef __REG_si
# error Out of sync with asm/irqflags.h!
#endif
@@ -43,8 +40,7 @@ ENTRY(this_cpu_cmpxchg16b_emu)
#define __REG_si %rbp
#define esi ebp
GET_VCPU_INFO
- pushq_cfi %rdi
- CFI_REL_OFFSET rdi, 0
+ pushq %rdi
__SAVE_INTERRUPTS(edi)
__DISABLE_INTERRUPTS
@@ -61,31 +57,24 @@ ENTRY(this_cpu_cmpxchg16b_emu)
test %edi, %edi
jz .Lcheck_events
- CFI_REMEMBER_STATE
.Lexit:
- popq_cfi %rdi
- CFI_RESTORE rdi
- popq_cfi %rbp
- CFI_RESTORE rbp
+ popq %rdi
+ popq %rbp
ret
- CFI_RESTORE_STATE
.Lcheck_events:
__ENABLE_INTERRUPTS
__TEST_PENDING
jz .Lexit
#undef __REG_si
#undef esi
- pushq_cfi %rsi
- CFI_REL_OFFSET rsi, 0
- pushq_cfi %rax
+ pushq %rsi
+ pushq %rax
/* %edi is zero already */
xor %esi, %esi
call hypercall_page + __HYPERVISOR_xen_version * 32
- popq_cfi %rax
- popq_cfi %rsi
- CFI_RESTORE rsi
+ popq %rax
+ popq %rsi
jmp .Lexit
- CFI_ENDPROC
ENDPROC(this_cpu_cmpxchg16b_emu)
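
[Editorial note: the assembly above emulates cmpxchg16b on a per-CPU variable by making the compare-and-swap atomic with respect to the local CPU only: it disables (virtual) interrupts, performs the 16-byte compare and conditional store, then restores the interrupt state and, on Xen, checks for pending events. In C the protected region corresponds to roughly the following sketch; the irq_save()/irq_restore() stubs stand in for the __SAVE_INTERRUPTS/__DISABLE_INTERRUPTS sequences, and real per-CPU atomicity additionally relies on preemption being disabled:]

#include <stdbool.h>
#include <stdio.h>

struct u128 { unsigned long lo, hi; };

static unsigned long irq_save(void) { return 0; }  /* stub: no real IRQs */
static void irq_restore(unsigned long flags) { (void)flags; }

static bool cmpxchg16b_emu(struct u128 *ptr, struct u128 old, struct u128 new)
{
    unsigned long flags = irq_save();
    bool ok = (ptr->lo == old.lo && ptr->hi == old.hi);

    if (ok)
        *ptr = new;  /* store only when both halves matched */
    irq_restore(flags);
    return ok;
}

int main(void)
{
    struct u128 v = { 1, 2 }, old = { 1, 2 }, new = { 3, 4 };

    printf("%d %lu\n", cmpxchg16b_emu(&v, old, new), v.lo); /* 1 3 */
    return 0;
}
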
--- a/arch/x86/mm/fault-xen.c
+++ b/arch/x86/mm/fault-xen.c
@@ -13,6 +13,7 @@
#include <linux/hugetlb.h> /* hstate_index_to_shift */
#include <linux/prefetch.h> /* prefetchw */
#include <linux/context_tracking.h> /* exception_enter(), ... */
+#include <linux/uaccess.h> /* faulthandler_disabled() */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
@@ -1152,9 +1153,9 @@ __do_page_fault(struct pt_regs *regs, un
/*
* If we're in an interrupt, have no user context or are running
- * in an atomic region then we must not take the fault:
+ * in a region with pagefaults disabled then we must not take the fault
*/
- if (unlikely(in_atomic() || !mm)) {
+ if (unlikely(faulthandler_disabled() || !mm)) {
bad_area_nosemaphore(regs, error_code, address);
return;
}
--- a/arch/x86/mm/highmem_32-xen.c
+++ b/arch/x86/mm/highmem_32-xen.c
@@ -35,7 +35,7 @@ void *kmap_atomic_prot(struct page *page
unsigned long vaddr;
int idx, type;
- /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+ preempt_disable();
pagefault_disable();
if (!PageHighMem(page))
@@ -100,6 +100,7 @@ void __kunmap_atomic(void *kvaddr)
#endif
pagefault_enable();
+ preempt_enable();
}
EXPORT_SYMBOL(__kunmap_atomic);
--- a/arch/x86/mm/init_32-xen.c
+++ b/arch/x86/mm/init_32-xen.c
@@ -469,7 +469,7 @@ void __init add_highpages_with_active_re
phys_addr_t start, end;
u64 i;
- for_each_free_mem_range(i, nid, &start, &end, NULL) {
+ for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL) {
unsigned long pfn = clamp_t(unsigned long, PFN_UP(start),
start_pfn, end_pfn);
unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end),
--- a/arch/x86/mm/iomap_32-xen.c
+++ b/arch/x86/mm/iomap_32-xen.c
@@ -60,6 +60,7 @@ void *kmap_atomic_prot_pfn(unsigned long
unsigned long vaddr;
int idx, type;
+ preempt_disable();
pagefault_disable();
type = kmap_atomic_idx_push();
@@ -78,13 +79,13 @@ void __iomem *
iomap_atomic_prot_pfn(unsigned long mfn, pgprot_t prot)
{
/*
- * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
- * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
- * MTRR is UC or WC. UC_MINUS gets the real intention, of the
- * user, which is "WC if the MTRR is WC, UC if you can't do that."
+ * For non-PAT systems, translate non-WB request to UC- just in
+ * case the caller set the PWT bit to prot directly without using
+ * pgprot_writecombine(). UC- translates to uncached if the MTRR
+ * is UC or WC. UC- gets the real intention, of the user, which is
+ * "WC if the MTRR is WC, UC if you can't do that."
*/
- if (!pat_enabled && pgprot_val(prot) ==
- (__PAGE_KERNEL | cachemode2protval(_PAGE_CACHE_MODE_WC)))
+ if (!pat_enabled() && pgprot2cachemode(prot) != _PAGE_CACHE_MODE_WB)
prot = __pgprot(__PAGE_KERNEL |
cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS));
@@ -119,5 +120,6 @@ iounmap_atomic(void __iomem *kvaddr)
}
pagefault_enable();
+ preempt_enable();
}
EXPORT_SYMBOL_GPL(iounmap_atomic);
--- a/arch/x86/mm/ioremap-xen.c
+++ b/arch/x86/mm/ioremap-xen.c
@@ -185,6 +185,9 @@ static int ioremap_change_attr(unsigned
case _PAGE_CACHE_MODE_WC:
err = _set_memory_wc(vaddr, nrpages);
break;
+ case _PAGE_CACHE_MODE_WT:
+ err = _set_memory_wt(vaddr, nrpages);
+ break;
case _PAGE_CACHE_MODE_WB:
err = _set_memory_wb(vaddr, nrpages);
break;
@@ -212,6 +215,25 @@ int ioremap_check_change_attr(unsigned l
return rc;
}
+static int __ioremap_check_ram(unsigned long start_mfn, unsigned long nr_pages,
+ void *arg)
+{
+ domid_t *domid = arg;
+ unsigned long i;
+
+ for (i = 0; i < nr_pages; ++i) {
+ unsigned long pfn = mfn_to_local_pfn(start_mfn + i);
+
+ if (pfn_valid(pfn)) {
+ if (!PageReserved(pfn_to_page(pfn)))
+ return 1;
+ *domid = DOMID_SELF;
+ }
+ }
+
+ return 0;
+}
+
/*
* Remap an arbitrary physical address space into the kernel virtual
* address space. It transparently creates kernel huge I/O mapping when
@@ -239,7 +261,6 @@ static void __iomem *__ioremap_caller(re
int retval;
domid_t domid = DOMID_IO;
void __iomem *ret_addr;
- int ram_region;
/* Don't allow wraparound or zero size */
last_addr = phys_addr + size - 1;
@@ -262,29 +283,18 @@ static void __iomem *__ioremap_caller(re
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
- /* First check if whole region can be identified as RAM or not */
- ram_region = is_initial_xendomain() ? region_is_ram(phys_addr, size)
- : -1;
- if (ram_region > 0) {
- WARN_ONCE(1, "ioremap on RAM at %#Lx - %#Lx\n",
- (unsigned long long)phys_addr,
- (unsigned long long)last_addr);
+ mfn = phys_addr >> PAGE_SHIFT;
+ last_mfn = last_addr >> PAGE_SHIFT;
+ if (is_initial_xendomain())
+ retval = walk_system_ram_range(mfn, last_mfn - mfn + 1, &domid,
+ __ioremap_check_ram);
+ else
+ retval = __ioremap_check_ram(mfn, last_mfn - mfn + 1, &domid);
+ if (retval == 1) {
+ WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n",
+ &phys_addr, &last_addr);
return NULL;
}
-
- /* If could not be identified(-1), check page by page */
- if (ram_region < 0) {
- last_mfn = PFN_DOWN(last_addr);
- for (mfn = PFN_DOWN(phys_addr); mfn <= last_mfn; mfn++) {
- unsigned long pfn = mfn_to_local_pfn(mfn);
-
- if (pfn_valid(pfn)) {
- if (!PageReserved(pfn_to_page(pfn)))
- return NULL;
- domid = DOMID_SELF;
- }
- }
- }
WARN_ON_ONCE(domid == DOMID_SELF);
/*
@@ -328,6 +338,10 @@ static void __iomem *__ioremap_caller(re
prot = __pgprot(pgprot_val(prot) |
cachemode2protval(_PAGE_CACHE_MODE_WC));
break;
+ case _PAGE_CACHE_MODE_WT:
+ prot = __pgprot(pgprot_val(prot) |
+ cachemode2protval(_PAGE_CACHE_MODE_WT));
+ break;
case _PAGE_CACHE_MODE_WB:
break;
}
@@ -391,10 +405,11 @@ void __iomem *ioremap_nocache(resource_s
{
/*
* Ideally, this should be:
- * pat_enabled ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS;
+ * pat_enabled() ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS;
*
* Till we fix all X drivers to use ioremap_wc(), we will use
- * UC MINUS.
+ * UC MINUS. Drivers that are certain they need or can already
+ * be converted over to strong UC can use ioremap_uc().
*/
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
@@ -404,6 +419,39 @@ void __iomem *ioremap_nocache(resource_s
EXPORT_SYMBOL(ioremap_nocache);
/**
+ * ioremap_uc - map bus memory into CPU space as strongly uncachable
+ * @phys_addr: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * ioremap_uc performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * This version of ioremap ensures that the memory is marked with a strong
+ * preference as completely uncachable on the CPU when possible. For non-PAT
+ * systems this ends up setting page-attribute flags PCD=1, PWT=1. For PAT
+ * systems this will set the PAT entry for the pages as strong UC. This call
+ * will honor existing caching rules from things like the PCI bus. Note that
+ * there are other caches and buffers on many busses. In particular driver
+ * authors should read up on PCI writes.
+ *
+ * It's useful if some control registers are in such an area and
+ * write combining or read caching is not desirable:
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size)
+{
+ enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
+
+ return __ioremap_caller(phys_addr, size, pcm,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL_GPL(ioremap_uc);
+
+/**
* ioremap_wc - map memory into CPU space write combined
* @phys_addr: bus address of the memory
* @size: size of the resource to map
@@ -415,14 +463,28 @@ EXPORT_SYMBOL(ioremap_nocache);
*/
void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
{
- if (pat_enabled)
- return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
__builtin_return_address(0));
- else
- return ioremap_nocache(phys_addr, size);
}
EXPORT_SYMBOL(ioremap_wc);
+/**
+ * ioremap_wt - map memory into CPU space write through
+ * @phys_addr: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * This version of ioremap ensures that the memory is marked write through.
+ * Write through stores data into memory while keeping the cache up-to-date.
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
+{
+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_wt);
+
void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
@@ -488,7 +550,7 @@ void iounmap(volatile void __iomem *addr
EXPORT_SYMBOL(iounmap);
#ifndef CONFIG_XEN
-int arch_ioremap_pud_supported(void)
+int __init arch_ioremap_pud_supported(void)
{
#ifdef CONFIG_X86_64
return cpu_has_gbpages;
@@ -497,7 +559,7 @@ int arch_ioremap_pud_supported(void)
#endif
}
-int arch_ioremap_pmd_supported(void)
+int __init arch_ioremap_pmd_supported(void)
{
return cpu_has_pse;
}
@@ -510,18 +572,18 @@ void *xlate_dev_mem_ptr(phys_addr_t phys
{
unsigned long start = phys & PAGE_MASK;
unsigned long offset = phys & ~PAGE_MASK;
- unsigned long vaddr;
+ void *vaddr;
/* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
if (page_is_ram(start >> PAGE_SHIFT))
return __va(phys);
- vaddr = (unsigned long)ioremap_cache(start, PAGE_SIZE);
+ vaddr = ioremap_cache(start, PAGE_SIZE);
/* Only add the offset on success and return NULL if the ioremap() failed: */
if (vaddr)
vaddr += offset;
- return (void *)vaddr;
+ return vaddr;
}
void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
@@ -530,7 +592,6 @@ void unxlate_dev_mem_ptr(phys_addr_t phy
return;
iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
- return;
}
#endif
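
[Editorial note: with ioremap_uc() and ioremap_wt() added above, a driver can pick from the full set of cache modes. A kernel-style usage sketch, not a standalone program; BAR_PHYS/BAR_SIZE and the split offsets are made-up device constants:]

#include <linux/errno.h>
#include <linux/io.h>

#define BAR_PHYS 0xfe000000UL  /* hypothetical device BAR */
#define BAR_SIZE 0x1000UL

static void __iomem *regs, *fb, *stats;

static int example_map(void)
{
    regs  = ioremap_uc(BAR_PHYS, BAR_SIZE);          /* strong UC: control regs */
    fb    = ioremap_wc(BAR_PHYS + 0x1000, BAR_SIZE); /* WC: streaming writes */
    stats = ioremap_wt(BAR_PHYS + 0x2000, BAR_SIZE); /* WT: read-mostly, cached */
    if (!regs || !fb || !stats)
        return -ENOMEM;
    /* Each mapping must eventually be released with iounmap(). */
    return 0;
}
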
--- a/arch/x86/mm/pageattr-xen.c
+++ b/arch/x86/mm/pageattr-xen.c
@@ -14,6 +14,7 @@
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/pci.h>
+#include <linux/vmalloc.h>
#include <asm/e820.h>
#include <asm/processor.h>
@@ -129,16 +130,15 @@ within(unsigned long addr, unsigned long
*/
void clflush_cache_range(void *vaddr, unsigned int size)
{
- void *vend = vaddr + size - 1;
+ unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1;
+ void *vend = vaddr + size;
+ void *p;
mb();
- for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
- clflushopt(vaddr);
- /*
- * Flush any possible final partial cacheline:
- */
- clflushopt(vend);
+ for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
+ p < vend; p += boot_cpu_data.x86_clflush_size)
+ clflushopt(p);
mb();
}
@@ -418,13 +418,11 @@ phys_addr_t slow_virt_to_phys(void *__vi
phys_addr_t phys_addr;
unsigned long offset;
enum pg_level level;
- unsigned long psize;
unsigned long pmask;
pte_t *pte;
pte = lookup_address(virt_addr, &level);
BUG_ON(!pte);
- psize = page_level_size(level);
pmask = page_level_mask(level);
offset = virt_addr & ~pmask;
phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
@@ -1559,6 +1557,9 @@ int _set_memory_uc(unsigned long addr, i
{
/*
* for now UC MINUS. see comments in ioremap_nocache()
+ * If you really need strong UC use ioremap_uc(), but note
+ * that you cannot override IO areas with set_memory_*() as
+ * these helpers cannot work with IO memory.
*/
return change_page_attr_set(&addr, numpages,
cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
@@ -1593,12 +1594,10 @@ EXPORT_SYMBOL(set_memory_uc);
static int _set_memory_array(unsigned long *addr, int addrinarray,
enum page_cache_mode new_type)
{
+ enum page_cache_mode set_type;
int i, j;
int ret;
- /*
- * for now UC MINUS. see comments in ioremap_nocache()
- */
for (i = 0; i < addrinarray; i++) {
ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
new_type, NULL);
@@ -1606,9 +1605,12 @@ static int _set_memory_array(unsigned lo
goto out_free;
}
+ /* If WC, set to UC- first and then WC */
+ set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
+ _PAGE_CACHE_MODE_UC_MINUS : new_type;
+
ret = change_page_attr_set(addr, addrinarray,
- cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
- 1);
+ cachemode2pgprot(set_type), 1);
if (!ret && new_type == _PAGE_CACHE_MODE_WC)
ret = change_page_attr_set_clr(addr, addrinarray,
@@ -1640,6 +1642,12 @@ int set_memory_array_wc(unsigned long *a
}
EXPORT_SYMBOL(set_memory_array_wc);
+int set_memory_array_wt(unsigned long *addr, int addrinarray)
+{
+ return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WT);
+}
+EXPORT_SYMBOL_GPL(set_memory_array_wt);
+
int _set_memory_wc(unsigned long addr, int numpages)
{
int ret;
@@ -1662,27 +1670,42 @@ int set_memory_wc(unsigned long addr, in
{
int ret;
- if (!pat_enabled)
- return set_memory_uc(addr, numpages);
-
ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
_PAGE_CACHE_MODE_WC, NULL);
if (ret)
- goto out_err;
+ return ret;
ret = _set_memory_wc(addr, numpages);
if (ret)
- goto out_free;
-
- return 0;
+ free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
-out_free:
- free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
-out_err:
return ret;
}
EXPORT_SYMBOL(set_memory_wc);
+int _set_memory_wt(unsigned long addr, int numpages)
+{
+ return change_page_attr_set(&addr, numpages,
+ cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0);
+}
+
+int set_memory_wt(unsigned long addr, int numpages)
+{
+ int ret;
+
+ ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+ _PAGE_CACHE_MODE_WT, NULL);
+ if (ret)
+ return ret;
+
+ ret = _set_memory_wt(addr, numpages);
+ if (ret)
+ free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(set_memory_wt);
+
int _set_memory_wb(unsigned long addr, int numpages)
{
/* WB cache mode is hard wired to all cache attribute bits being 0 */
@@ -1773,6 +1796,7 @@ static int _set_pages_array(struct page
{
unsigned long start;
unsigned long end;
+ enum page_cache_mode set_type;
int i;
int free_idx;
int ret;
@@ -1786,8 +1810,12 @@ static int _set_pages_array(struct page
goto err_out;
}
+ /* If WC, set to UC- first and then WC */
+ set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
+ _PAGE_CACHE_MODE_UC_MINUS : new_type;
+
ret = cpa_set_pages_array(pages, addrinarray,
- cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS));
+ cachemode2pgprot(set_type));
if (!ret && new_type == _PAGE_CACHE_MODE_WC)
ret = change_page_attr_set_clr(NULL, addrinarray,
cachemode2pgprot(
@@ -1821,6 +1849,12 @@ int set_pages_array_wc(struct page **pag
}
EXPORT_SYMBOL(set_pages_array_wc);
+int set_pages_array_wt(struct page **pages, int addrinarray)
+{
+ return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WT);
+}
+EXPORT_SYMBOL_GPL(set_pages_array_wt);
+
int set_pages_wb(struct page *page, int numpages)
{
unsigned long addr = (unsigned long)page_address(page);
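
[Editorial note: set_memory_wc() and the new set_memory_wt() above share a tightened error pattern: reserve the memtype first, attempt the attribute change, and free the reservation only if the change fails, replacing the old two-label goto ladder. Generalized, the shape is as follows; the stub helpers are hypothetical stand-ins for reserve_memtype()/free_memtype():]

#include <stdio.h>

static int reserve(int id) { printf("reserve %d\n", id); return 0; }
static void release(int id) { printf("release %d\n", id); }
static int do_change(int id) { (void)id; return -1; /* pretend it failed */ }

static int set_with_rollback(int id)
{
    int ret = reserve(id);

    if (ret)
        return ret;       /* nothing was taken; nothing to undo */

    ret = do_change(id);
    if (ret)
        release(id);      /* undo the reservation on failure only */

    return ret;
}

int main(void) { return set_with_rollback(1) ? 0 : 1; }
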
--- a/arch/x86/mm/pat-xen.c
+++ b/arch/x86/mm/pat-xen.c
@@ -33,13 +33,17 @@
#include "pat_internal.h"
#include "mm_internal.h"
-#ifdef CONFIG_X86_PAT
-int __read_mostly pat_enabled = 1;
+#undef pr_fmt
+#define pr_fmt(fmt) "" fmt
+
+static bool boot_cpu_done;
+
+static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT);
static inline void pat_disable(const char *reason)
{
- pat_enabled = 0;
- printk(KERN_INFO "%s\n", reason);
+ __pat_enabled = 0;
+ pr_info("x86/PAT: %s\n", reason);
}
static int __init nopat(char *str)
@@ -48,13 +52,12 @@ static int __init nopat(char *str)
return 0;
}
early_param("nopat", nopat);
-#else
-static inline void pat_disable(const char *reason)
+
+bool pat_enabled(void)
{
- (void)reason;
+ return !!__pat_enabled;
}
-#endif
-
+EXPORT_SYMBOL_GPL(pat_enabled);
int pat_debug_enable;
@@ -65,22 +68,24 @@ static int __init pat_debug_setup(char *
}
__setup("debugpat", pat_debug_setup);
-static u64 __read_mostly boot_pat_state;
-
#ifdef CONFIG_X86_PAT
/*
- * X86 PAT uses page flags WC and Uncached together to keep track of
- * memory type of pages that have backing page struct. X86 PAT supports 3
- * different memory types, _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC and
- * _PAGE_CACHE_MODE_UC_MINUS and fourth state where page's memory type has not
- * been changed from its default (value of -1 used to denote this).
- * Note we do not support _PAGE_CACHE_MODE_UC here.
+ * X86 PAT uses page flags arch_1 and uncached together to keep track of
+ * memory type of pages that have backing page struct.
+ *
+ * X86 PAT supports 4 different memory types:
+ * - _PAGE_CACHE_MODE_WB
+ * - _PAGE_CACHE_MODE_WC
+ * - _PAGE_CACHE_MODE_UC_MINUS
+ * - _PAGE_CACHE_MODE_WT
+ *
+ * _PAGE_CACHE_MODE_WB is the default type.
*/
-#define _PGMT_DEFAULT 0
+#define _PGMT_WB 0
#define _PGMT_WC (1UL << PG_arch_1)
#define _PGMT_UC_MINUS (1UL << PG_uncached)
-#define _PGMT_WB (1UL << PG_uncached | 1UL << PG_arch_1)
+#define _PGMT_WT (1UL << PG_uncached | 1UL << PG_arch_1)
#define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1)
#define _PGMT_CLEAR_MASK (~_PGMT_MASK)
@@ -88,14 +93,14 @@ static inline enum page_cache_mode get_p
{
unsigned long pg_flags = pg->flags & _PGMT_MASK;
- if (pg_flags == _PGMT_DEFAULT)
- return -1;
+ if (pg_flags == _PGMT_WB)
+ return _PAGE_CACHE_MODE_WB;
else if (pg_flags == _PGMT_WC)
return _PAGE_CACHE_MODE_WC;
else if (pg_flags == _PGMT_UC_MINUS)
return _PAGE_CACHE_MODE_UC_MINUS;
else
- return _PAGE_CACHE_MODE_WB;
+ return _PAGE_CACHE_MODE_WT;
}
static inline void set_page_memtype(struct page *pg,
@@ -112,11 +117,12 @@ static inline void set_page_memtype(stru
case _PAGE_CACHE_MODE_UC_MINUS:
memtype_flags = _PGMT_UC_MINUS;
break;
- case _PAGE_CACHE_MODE_WB:
- memtype_flags = _PGMT_WB;
+ case _PAGE_CACHE_MODE_WT:
+ memtype_flags = _PGMT_WT;
break;
+ case _PAGE_CACHE_MODE_WB:
default:
- memtype_flags = _PGMT_DEFAULT;
+ memtype_flags = _PGMT_WB;
break;
}
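
Taken together, the two page flags now encode four tracked states; the mapping implemented by get_page_memtype()/set_page_memtype() above is:

	/*
	 *   PG_uncached  PG_arch_1   tracked memory type
	 *        0           0       _PAGE_CACHE_MODE_WB (default)
	 *        0           1       _PAGE_CACHE_MODE_WC
	 *        1           0       _PAGE_CACHE_MODE_UC_MINUS
	 *        1           1       _PAGE_CACHE_MODE_WT
	 */
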
@@ -174,88 +180,162 @@ static enum page_cache_mode pat_get_cach
* configuration.
* Using lower indices is preferred, so we start with highest index.
*/
-void pat_init_cache_modes(void)
+void pat_init_cache_modes(u64 pat)
{
- int i;
enum page_cache_mode cache;
char pat_msg[33];
- u64 pat;
+ int i;
- rdmsrl(MSR_IA32_CR_PAT, pat);
pat_msg[32] = 0;
for (i = 7; i >= 0; i--) {
cache = pat_get_cache_mode((pat >> (i * 8)) & 7,
pat_msg + 4 * i);
update_cache_mode_entry(i, cache);
}
- pr_info("PAT configuration [0-7]: %s\n", pat_msg);
+ pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg);
}
#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
-void pat_init(void)
+static void pat_bsp_init(u64 pat)
{
- u64 pat;
- bool boot_cpu = !boot_pat_state;
+ u64 tmp_pat;
- if (!pat_enabled)
+ if (!cpu_has_pat) {
+ pat_disable("PAT not supported by CPU.");
return;
+ }
- if (!cpu_has_pat) {
- if (!boot_pat_state) {
- pat_disable("PAT not supported by CPU.");
- return;
- } else {
- /*
- * If this happens we are on a secondary CPU, but
- * switched to PAT on the boot CPU. We have no way to
- * undo PAT.
- */
- printk(KERN_ERR "PAT enabled, "
- "but not supported by secondary CPU\n");
- BUG();
- }
+ if (!pat_enabled())
+ goto done;
+
+ rdmsrl(MSR_IA32_CR_PAT, tmp_pat);
+ if (!tmp_pat) {
+ pat_disable("PAT MSR is 0, disabled.");
+ return;
}
#ifndef CONFIG_XEN
- /* Set PWT to Write-Combining. All other bits stay the same */
/*
- * PTE encoding used in Linux:
- * PAT
- * |PCD
- * ||PWT
- * |||
- * 000 WB _PAGE_CACHE_WB
- * 001 WC _PAGE_CACHE_WC
- * 010 UC- _PAGE_CACHE_UC_MINUS
- * 011 UC _PAGE_CACHE_UC
- * PAT bit unused
+ * PAT settings are part of the hypervisor interface, and their
+ * assignment cannot be changed.
*/
- pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
- PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
+ wrmsrl(MSR_IA32_CR_PAT, pat);
+#endif
- /* Boot CPU check */
- if (!boot_pat_state) {
- rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);
- if (!boot_pat_state) {
- pat_disable("PAT read returns always zero, disabled.");
- return;
- }
+done:
+ pat_init_cache_modes(pat);
+}
+
+static void pat_ap_init(u64 pat)
+{
+ if (!pat_enabled())
+ return;
+
+ if (!cpu_has_pat) {
+ /*
+ * If this happens we are on a secondary CPU, but switched to
+ * PAT on the boot CPU. We have no way to undo PAT.
+ */
+ panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n");
}
+#ifndef CONFIG_XEN
wrmsrl(MSR_IA32_CR_PAT, pat);
-#else
- /*
- * PAT settings are part of the hypervisor interface, and their
- * assignment cannot be changed.
- */
- rdmsrl(MSR_IA32_CR_PAT, pat);
- if (!boot_pat_state)
- boot_pat_state = pat;
#endif
+}
+
+void pat_init(void)
+{
+ u64 pat;
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if (!pat_enabled()) {
+ /*
+ * No PAT. Emulate the PAT table that corresponds to the two
+ * cache bits, PWT (Write Through) and PCD (Cache Disable). This
+ * setup is the same as the BIOS default setup when the system
+ * has PAT but the "nopat" boot option has been specified. This
+ * emulated PAT table is used when MSR_IA32_CR_PAT returns 0.
+ *
+ * PTE encoding:
+ *
+ * PCD
+ * |PWT PAT
+ * || slot
+ * 00 0 WB : _PAGE_CACHE_MODE_WB
+ * 01 1 WT : _PAGE_CACHE_MODE_WT
+ * 10 2 UC-: _PAGE_CACHE_MODE_UC_MINUS
+ * 11 3 UC : _PAGE_CACHE_MODE_UC
+ *
+ * NOTE: When WC or WP is used, it is redirected to UC- per
+ * the default setup in __cachemode2pte_tbl[].
+ */
+ pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) |
+ PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC);
- if (boot_cpu)
- pat_init_cache_modes();
+ } else if ((c->x86_vendor == X86_VENDOR_INTEL) &&
+ (((c->x86 == 0x6) && (c->x86_model <= 0xd)) ||
+ ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) {
+ /*
+ * PAT support with the lower four entries. Intel Pentium 2,
+ * 3, M, and 4 are affected by PAT errata, which makes the
+ * upper four entries unusable. To be on the safe side, we don't
+ * use those.
+ *
+ * PTE encoding:
+ * PAT
+ * |PCD
+ * ||PWT PAT
+ * ||| slot
+ * 000 0 WB : _PAGE_CACHE_MODE_WB
+ * 001 1 WC : _PAGE_CACHE_MODE_WC
+ * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS
+ * 011 3 UC : _PAGE_CACHE_MODE_UC
+ * PAT bit unused
+ *
+ * NOTE: When WT or WP is used, it is redirected to UC- per
+ * the default setup in __cachemode2pte_tbl[].
+ */
+ pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
+ PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
+ } else {
+ /*
+ * Full PAT support. We put WT in slot 7 to improve
+ * robustness in the presence of errata that might cause
+ * the high PAT bit to be ignored. This way, a buggy slot 7
+ * access will hit slot 3, and slot 3 is UC, so at worst
+ * we lose performance without causing a correctness issue.
+ * Pentium 4 erratum N46 is an example of such an erratum,
+ * although we try not to use PAT at all on affected CPUs.
+ *
+ * PTE encoding:
+ * PAT
+ * |PCD
+ * ||PWT PAT
+ * ||| slot
+ * 000 0 WB : _PAGE_CACHE_MODE_WB
+ * 001 1 WC : _PAGE_CACHE_MODE_WC
+ * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS
+ * 011 3 UC : _PAGE_CACHE_MODE_UC
+ * 100 4 WB : Reserved
+ * 101 5 WC : Reserved
+ * 110 6 UC-: Reserved
+ * 111 7 WT : _PAGE_CACHE_MODE_WT
+ *
+ * The reserved slots are unused, but mapped to their
+ * corresponding types in the presence of PAT errata.
+ */
+ pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
+ PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT);
+ }
+
+ if (!boot_cpu_done) {
+ pat_bsp_init(pat);
+ boot_cpu_done = true;
+ } else {
+ pat_ap_init(pat);
+ }
}
#undef PAT
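
For reference, the PAT MSR image assembled in the full-support branch above decomposes into eight 3-bit entries exactly as pat_init_cache_modes() reads them back. A standalone user-space sketch (the constant assumes the architectural encodings PAT_UC=0, PAT_WC=1, PAT_WT=4, PAT_WB=6 and PAT_UC_MINUS=7 behind the PAT() macro):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		/* PAT(0,WB)|PAT(1,WC)|PAT(2,UC_MINUS)|PAT(3,UC)|
		 * PAT(4,WB)|PAT(5,WC)|PAT(6,UC_MINUS)|PAT(7,WT) */
		uint64_t pat = 0x0407010600070106ULL;
		int i;

		for (i = 7; i >= 0; i--)  /* same order as pat_init_cache_modes() */
			printf("PAT%d = %u\n", i, (unsigned int)((pat >> (i * 8)) & 7));
		return 0;
	}
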
@@ -263,12 +343,17 @@ void pat_init(void)
static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */
static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end);
-static inline u8 _mtrr_type_lookup(u64 start, u64 end)
+static inline u8 _mtrr_type_lookup(u64 start, u64 end, u8 *uniform)
{
+ int ret;
+
if (is_initial_xendomain())
- return mtrr_type_lookup(start, end);
- return pat_pagerange_is_ram(start, end) > 0
- ? MTRR_TYPE_WRCOMB : MTRR_TYPE_UNCACHABLE;
+ return mtrr_type_lookup(start, end, uniform);
+
+ ret = pat_pagerange_is_ram(start, end);
+ *uniform = ret >= 0;
+
+ return ret > 0 ? MTRR_TYPE_WRCOMB : MTRR_TYPE_UNCACHABLE;
}
#define mtrr_type_lookup _mtrr_type_lookup
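
The Xen-specific wrapper keeps the initial domain on the real MTRR walk and approximates everything else from the RAM map. Since pat_pagerange_is_ram() returns 1 for an all-RAM range, 0 for no RAM and a negative value for a mix, the resulting semantics are (sketch):

	/*
	 *   dom0:               real mtrr_type_lookup(), *uniform from MTRR state
	 *   domU, all RAM:      MTRR_TYPE_WRCOMB,     *uniform = 1
	 *   domU, no RAM:       MTRR_TYPE_UNCACHABLE, *uniform = 1
	 *   domU, mixed range:  MTRR_TYPE_UNCACHABLE, *uniform = 0
	 */
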
@@ -287,9 +372,9 @@ static unsigned long pat_x_mtrr_type(u64
* request is for WB.
*/
if (req_type == _PAGE_CACHE_MODE_WB) {
- u8 mtrr_type;
+ u8 mtrr_type, uniform;
- mtrr_type = mtrr_type_lookup(start, end);
+ mtrr_type = mtrr_type_lookup(start, end, &uniform);
if (mtrr_type != MTRR_TYPE_WRBACK)
return _PAGE_CACHE_MODE_UC_MINUS;
@@ -328,9 +413,14 @@ static int pat_pagerange_is_ram(resource
/*
* For RAM pages, we use page flags to mark the pages with appropriate type.
- * Here we do two pass:
- * - Find the memtype of all the pages in the range, look for any conflicts
- * - In case of no conflicts, set the new memtype for pages in the range
+ * The page flags are limited to four types, WB (default), WC, WT and UC-.
+ * WP request fails with -EINVAL, and UC gets redirected to UC-. Setting
+ * a new memory type is only allowed for a page mapped with the default WB
+ * type.
+ *
+ * Here we do two passes:
+ * - Find the memtype of all the pages in the range, look for any conflicts.
+ * - In case of no conflicts, set the new memtype for pages in the range.
*/
static int reserve_ram_pages_type(u64 start, u64 end,
enum page_cache_mode req_type,
@@ -339,6 +429,12 @@ static int reserve_ram_pages_type(u64 st
struct page *page;
unsigned long mfn;
+ if (req_type == _PAGE_CACHE_MODE_WP) {
+ if (new_type)
+ *new_type = _PAGE_CACHE_MODE_UC_MINUS;
+ return -EINVAL;
+ }
+
if (req_type == _PAGE_CACHE_MODE_UC) {
/* We do not support strong UC */
WARN_ON_ONCE(1);
@@ -352,8 +448,8 @@ static int reserve_ram_pages_type(u64 st
BUG_ON(!pfn_valid(pfn));
page = pfn_to_page(pfn);
type = get_page_memtype(page);
- if (type != -1) {
- pr_info("reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
+ if (type != _PAGE_CACHE_MODE_WB) {
+ pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
start, end - 1, type, req_type);
if (new_type)
*new_type = type;
@@ -382,7 +478,7 @@ static int free_ram_pages_type(u64 start
BUG_ON(!pfn_valid(pfn));
page = pfn_to_page(pfn);
- set_page_memtype(page, -1);
+ set_page_memtype(page, _PAGE_CACHE_MODE_WB);
}
return 0;
}
@@ -393,6 +489,7 @@ static int free_ram_pages_type(u64 start
* - _PAGE_CACHE_MODE_WC
* - _PAGE_CACHE_MODE_UC_MINUS
* - _PAGE_CACHE_MODE_UC
+ * - _PAGE_CACHE_MODE_WT
*
* If new_type is NULL, function will return an error if it cannot reserve the
* region with req_type. If new_type is non-NULL, function will return
@@ -409,14 +506,10 @@ int reserve_memtype(u64 start, u64 end,
BUG_ON(start >= end); /* end is exclusive */
- if (!pat_enabled) {
+ if (!pat_enabled()) {
/* This is identical to page table setting without PAT */
- if (new_type) {
- if (req_type == _PAGE_CACHE_MODE_WC)
- *new_type = _PAGE_CACHE_MODE_UC_MINUS;
- else
- *new_type = req_type;
- }
+ if (new_type)
+ *new_type = req_type;
return 0;
}
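
Note that with PAT disabled the request is now echoed back verbatim, where the old code silently demoted WC to UC-. For the enabled path, a minimal sketch of the reserve/free pairing as a hypothetical caller would use it (base/size and the error policy are illustrative, not from this patch):

	enum page_cache_mode got;
	int err = reserve_memtype(base, base + size, _PAGE_CACHE_MODE_WC, &got);

	if (err)
		return err;
	if (got != _PAGE_CACHE_MODE_WC) {
		/* range already tracked with an incompatible type */
		free_memtype(base, base + size);
		return -EBUSY;
	}
	/* ... map using cachemode2pgprot(got) ... */
	free_memtype(base, base + size);
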
@@ -460,9 +553,9 @@ int reserve_memtype(u64 start, u64 end,
err = rbt_memtype_check_insert(new, new_type);
if (err) {
- printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
- start, end - 1,
- cattr_name(new->type), cattr_name(req_type));
+ pr_info("x86/PAT: reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
+ start, end - 1,
+ cattr_name(new->type), cattr_name(req_type));
kfree(new);
spin_unlock(&memtype_lock);
@@ -484,7 +577,7 @@ int free_memtype(u64 start, u64 end)
int is_range_ram;
struct memtype *entry;
- if (!pat_enabled)
+ if (!pat_enabled())
return 0;
/* Low ISA region is always mapped WB. No need to track */
@@ -506,8 +599,8 @@ int free_memtype(u64 start, u64 end)
spin_unlock(&memtype_lock);
if (!entry) {
- printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
- current->comm, current->pid, start, end - 1);
+ pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
+ current->comm, current->pid, start, end - 1);
return -EINVAL;
}
@@ -527,7 +620,7 @@ int free_memtype(u64 start, u64 end)
* Only to be called when PAT is enabled
*
* Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS
- * or _PAGE_CACHE_MODE_UC
+ * or _PAGE_CACHE_MODE_WT.
*/
static enum page_cache_mode lookup_memtype(u64 paddr)
{
@@ -539,16 +632,9 @@ static enum page_cache_mode lookup_memty
if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
struct page *page;
- page = pfn_to_page(paddr >> PAGE_SHIFT);
- rettype = get_page_memtype(page);
- /*
- * -1 from get_page_memtype() implies RAM page is in its
- * default state and not reserved, and hence of type WB
- */
- if (rettype == -1)
- rettype = _PAGE_CACHE_MODE_WB;
- return rettype;
+ page = pfn_to_page(paddr >> PAGE_SHIFT);
+ return get_page_memtype(page);
}
spin_lock(&memtype_lock);
@@ -634,13 +720,13 @@ static inline int range_is_allowed(unsig
u64 to = from + size;
u64 cursor = from;
- if (!pat_enabled)
+ if (!pat_enabled())
return 1;
while (cursor < to) {
if (!devmem_is_allowed(mfn)) {
- printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
- current->comm, from, to - 1);
+ pr_info("x86/PAT: Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
+ current->comm, from, to - 1);
return 0;
}
cursor += PAGE_SIZE;
@@ -671,7 +757,7 @@ int phys_mem_access_prot_allowed(struct
* caching for the high addresses through the KEN pin, but
* we maintain the tradition of paranoia in this code.
*/
- if (!pat_enabled &&
+ if (!pat_enabled() &&
!(boot_cpu_has(X86_FEATURE_MTRR) ||
boot_cpu_has(X86_FEATURE_K6_MTRR) ||
boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
@@ -719,12 +805,12 @@ static int reserve_pfn_range(u64 paddr,
* the type requested matches the type of first page in the range.
*/
if (is_ram) {
- if (!pat_enabled)
+ if (!pat_enabled())
return 0;
pcm = lookup_memtype(paddr);
if (want_pcm != pcm) {
- printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
+ pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
current->comm, current->pid,
cattr_name(want_pcm),
(unsigned long long)paddr,
@@ -745,13 +831,12 @@ static int reserve_pfn_range(u64 paddr,
if (strict_prot ||
!is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
free_memtype(paddr, paddr + size);
- printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
- " for [mem %#010Lx-%#010Lx], got %s\n",
- current->comm, current->pid,
- cattr_name(want_pcm),
- (unsigned long long)paddr,
- (unsigned long long)(paddr + size - 1),
- cattr_name(pcm));
+ pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n",
+ current->comm, current->pid,
+ cattr_name(want_pcm),
+ (unsigned long long)paddr,
+ (unsigned long long)(paddr + size - 1),
+ cattr_name(pcm));
return -EINVAL;
}
/*
@@ -834,7 +919,7 @@ int track_pfn_remap(struct vm_area_struc
return ret;
}
- if (!pat_enabled)
+ if (!pat_enabled())
return 0;
/*
@@ -862,7 +947,7 @@ int track_pfn_insert(struct vm_area_stru
{
enum page_cache_mode pcm;
- if (!pat_enabled)
+ if (!pat_enabled())
return 0;
/* Set prot based on lookup */
@@ -904,14 +989,18 @@ void untrack_pfn(struct vm_area_struct *
pgprot_t pgprot_writecombine(pgprot_t prot)
{
- if (pat_enabled)
- return __pgprot(pgprot_val(prot) |
+ return __pgprot(pgprot_val(prot) |
cachemode2protval(_PAGE_CACHE_MODE_WC));
- else
- return pgprot_noncached(prot);
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);
+pgprot_t pgprot_writethrough(pgprot_t prot)
+{
+ return __pgprot(pgprot_val(prot) |
+ cachemode2protval(_PAGE_CACHE_MODE_WT));
+}
+EXPORT_SYMBOL_GPL(pgprot_writethrough);
+
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
static struct memtype *memtype_get_idx(loff_t pos)
@@ -987,7 +1076,7 @@ static const struct file_operations memt
static int __init pat_memtype_list_init(void)
{
- if (pat_enabled) {
+ if (pat_enabled()) {
debugfs_create_file("pat_memtype_list", S_IRUSR,
arch_debugfs_dir, NULL, &memtype_fops);
}
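
pgprot_writecombine() can drop its PAT-disabled fallback because, with the emulated PAT table installed by pat_init(), a WC request is redirected to UC- through __cachemode2pte_tbl[] anyway; the same reasoning lets the new pgprot_writethrough() stay unconditional. A hypothetical user (sketch):

	/* e.g. a driver wanting write-through semantics for its mmap: */
	vma->vm_page_prot = pgprot_writethrough(vma->vm_page_prot);
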
--- a/arch/x86/mm/pgtable-xen.c
+++ b/arch/x86/mm/pgtable-xen.c
@@ -994,16 +994,31 @@ void xen_set_fixmap(enum fixed_addresses
}
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+/**
+ * pud_set_huge - setup kernel PUD mapping
+ *
+ * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
+ * function sets up a huge page only if any of the following conditions are met:
+ *
+ * - MTRRs are disabled, or
+ *
+ * - MTRRs are enabled and the range is completely covered by a single MTRR, or
+ *
+ * - MTRRs are enabled and the corresponding MTRR memory type is WB, which
+ * has no effect on the requested PAT memory type.
+ *
+ * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
+ * page mapping attempt fails.
+ *
+ * Returns 1 on success and 0 on failure.
+ */
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
- u8 mtrr;
+ u8 mtrr, uniform;
- /*
- * Do not use a huge page when the range is covered by non-WB type
- * of MTRRs.
- */
- mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE);
- if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
+ mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
+ if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
+ (mtrr != MTRR_TYPE_WRBACK))
return 0;
prot = pgprot_4k_2_large(prot);
@@ -1015,17 +1030,24 @@ int pud_set_huge(pud_t *pud, phys_addr_t
return 1;
}
+/**
+ * pmd_set_huge - setup kernel PMD mapping
+ *
+ * See the comment above pud_set_huge().
+ *
+ * Returns 1 on success and 0 on failure.
+ */
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
- u8 mtrr;
+ u8 mtrr, uniform;
- /*
- * Do not use a huge page when the range is covered by non-WB type
- * of MTRRs.
- */
- mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE);
- if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
+ mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
+ if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
+ (mtrr != MTRR_TYPE_WRBACK)) {
+ pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
+ __func__, addr, addr + PMD_SIZE);
return 0;
+ }
prot = pgprot_4k_2_large(prot);
@@ -1036,6 +1058,11 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t
return 1;
}
+/**
+ * pud_clear_huge - clear kernel PUD mapping when it is set
+ *
+ * Returns 1 on success and 0 on failure (no PUD map is found).
+ */
int pud_clear_huge(pud_t *pud)
{
if (pud_large(*pud)) {
@@ -1046,6 +1073,11 @@ int pud_clear_huge(pud_t *pud)
return 0;
}
+/**
+ * pmd_clear_huge - clear kernel PMD mapping when it is set
+ *
+ * Returns 1 on success and 0 on failure (no PMD map is found).
+ */
int pmd_clear_huge(pmd_t *pmd)
{
if (pmd_large(*pmd)) {
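
The decreasing-page-size retry that the pud_set_huge() comment describes looks roughly like this in a caller (sketch only; the real consumer is the generic ioremap path, and the allocation of the pud/pmd entries is elided here):

	if (!pud_set_huge(pud, phys_addr, prot)) {
		/* 1GB mapping refused (e.g. non-uniform, non-WB MTRR):
		 * retry the same range with 2MB PMDs, ... */
		if (!pmd_set_huge(pmd, phys_addr, prot)) {
			/* ... and finally fall back to 4K PTEs. */
		}
	}
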
--- a/drivers/acpi/acpi_processor.c
+++ b/drivers/acpi/acpi_processor.c
@@ -285,11 +285,12 @@ static int acpi_processor_get_info(struc
*/
if (invalid_logical_cpuid(pr->id)) {
int ret = acpi_processor_hotadd_init(pr);
- if (ret && (ret != -ENODEV || pr->phys_id == -1))
+ if (ret && (ret != -ENODEV || invalid_phys_cpuid(pr->phys_id)))
return ret;
}
#if defined(CONFIG_SMP) && defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL)
- if (pr->id >= setup_max_cpus && pr->id > 0)
+ if (pr->id >= setup_max_cpus && pr->id
+ && !invalid_logical_cpuid(pr->id))
pr->id = -1;
#endif
@@ -302,7 +303,7 @@ static int acpi_processor_get_info(struc
* generated as the following format:
* CPU+CPU ID.
*/
- if (pr->id != -1)
+ if (!invalid_logical_cpuid(pr->id))
sprintf(acpi_device_bid(device), "CPU%X", pr->id);
else
snprintf(acpi_device_bid(device),
@@ -341,7 +342,7 @@ static int acpi_processor_get_info(struc
* of /proc/cpuinfo
*/
status = acpi_evaluate_integer(pr->handle, "_SUN", NULL, &value);
- if (ACPI_SUCCESS(status) && pr->id != -1)
+ if (ACPI_SUCCESS(status) && !invalid_logical_cpuid(pr->id))
arch_fix_phys_package_id(pr->id, value);
return 0;
@@ -385,14 +386,14 @@ static int acpi_processor_add(struct acp
result = acpi_processor_get_info(device);
if (result || /* Processor is not physically present or unavailable */
- ((pr->id == -1) && !processor_cntl_external()))
+ (invalid_logical_cpuid(pr->id) && !processor_cntl_external()))
return 0;
#ifdef CONFIG_SMP
if (pr->id >= setup_max_cpus && pr->id != 0) {
if (!processor_cntl_external())
return 0;
- WARN_ON(pr->id != -1);
+ WARN_ON(!invalid_logical_cpuid(pr->id));
}
#endif
@@ -441,7 +442,7 @@ static int acpi_processor_add(struct acp
#ifndef CONFIG_XEN
per_cpu(processor_device_array, pr->id) = device;
#else
- if (pr->id != -1) {
+ if (!invalid_logical_cpuid(pr->id)) {
#endif
per_cpu(processors, pr->id) = pr;
@@ -471,7 +472,7 @@ static int acpi_processor_add(struct acp
free_cpumask_var(pr->throttling.shared_cpu_map);
device->driver_data = NULL;
#ifdef CONFIG_XEN
- if (pr->id != -1)
+ if (!invalid_logical_cpuid(pr->id))
#endif
per_cpu(processors, pr->id) = NULL;
err_free_pr:
@@ -503,7 +504,7 @@ static void acpi_processor_remove(struct
* Unbind the driver from the processor device and detach it from the
* ACPI companion object.
*/
- if (pr->id != -1) {
+ if (!invalid_logical_cpuid(pr->id)) {
device_release_driver(pr->dev);
acpi_unbind_one(pr->dev);
}
@@ -517,7 +518,7 @@ static void acpi_processor_remove(struct
mutex_lock(&processor_device_mutex);
radix_tree_delete(&processor_device_tree, pr->acpi_id);
mutex_unlock(&processor_device_mutex);
- if (pr->id != -1)
+ if (!invalid_logical_cpuid(pr->id))
per_cpu(processors, pr->id) = NULL;
goto out;
#else
--- a/drivers/acpi/processor_driver.c
+++ b/drivers/acpi/processor_driver.c
@@ -168,12 +168,12 @@ static int acpi_pss_perf_init(struct acp
acpi_processor_ppc_has_changed(pr, 0);
/*
- * pr->id may equal to -1 while processor_cntl_external enabled.
+ * pr->id may be invalid when processor_cntl_external is enabled.
* throttle and thermal module don't support this case.
* Tx only works when dom0 vcpu == pcpu num by far, as we give
* control to dom0.
*/
- if (pr->id != -1) {
+ if (!invalid_logical_cpuid(pr->id)) {
acpi_processor_get_throttling_info(pr);
if (pr->flags.throttling)
--- a/drivers/pci/msi-xen.c
+++ b/drivers/pci/msi-xen.c
@@ -58,27 +58,6 @@ struct msi_dev_list {
/* Arch hooks */
-static void msi_set_enable(struct pci_dev *dev, int enable)
-{
- u16 control;
-
- pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &control);
- control &= ~PCI_MSI_FLAGS_ENABLE;
- if (enable)
- control |= PCI_MSI_FLAGS_ENABLE;
- pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, control);
-}
-
-static void msix_clear_and_set_ctrl(struct pci_dev *dev, u16 clear, u16 set)
-{
- u16 ctrl;
-
- pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &ctrl);
- ctrl &= ~clear;
- ctrl |= set;
- pci_write_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, ctrl);
-}
-
static int (*get_owner)(struct pci_dev *dev);
static domid_t msi_get_dev_owner(struct pci_dev *dev)
@@ -326,11 +305,11 @@ void pci_restore_msi_state(struct pci_de
pci_intx_for_msi(dev, 0);
if (dev->msi_enabled)
- msi_set_enable(dev, 0);
+ pci_msi_set_enable(dev, 0);
if (dev->msix_enabled)
- msix_clear_and_set_ctrl(dev, 0,
- PCI_MSIX_FLAGS_ENABLE |
- PCI_MSIX_FLAGS_MASKALL);
+ pci_msix_clear_and_set_ctrl(dev, 0,
+ PCI_MSIX_FLAGS_ENABLE |
+ PCI_MSIX_FLAGS_MASKALL);
if (pci_seg_supported) {
struct physdev_pci_device restore = {
@@ -356,7 +335,7 @@ void pci_restore_msi_state(struct pci_de
WARN(rc && rc != -ENOSYS, "restore_msi -> %d\n", rc);
if (dev->msix_enabled)
- msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0);
+ pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0);
}
EXPORT_SYMBOL_GPL(pci_restore_msi_state);
@@ -555,7 +534,7 @@ static int msi_capability_init(struct pc
struct msi_dev_list *dev_entry = get_msi_dev_pirq_list(dev);
int pirq;
- msi_set_enable(dev, 0); /* Disable MSI during set up */
+ pci_msi_set_enable(dev, 0); /* Disable MSI during set up */
pirq = msi_map_vector(dev, nvec, 0, dev_entry->owner);
if (pirq < 0)
@@ -564,7 +543,7 @@ static int msi_capability_init(struct pc
/* Set MSI enabled bits */
pci_intx_for_msi(dev, 0);
- msi_set_enable(dev, 1);
+ pci_msi_set_enable(dev, 1);
dev->msi_enabled = 1;
dev->irq = dev_entry->e.pirq = pirq;
@@ -594,7 +573,7 @@ static int msix_capability_init(struct p
return -ENOMEM;
/* Ensure MSI-X is disabled while it is set up */
- msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
+ pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
table_base = find_table_base(dev);
if (!table_base)
@@ -605,7 +584,7 @@ static int msix_capability_init(struct p
* MSI-X registers. We need to mask all the vectors to prevent
* interrupts coming in before they're fully set up.
*/
- msix_clear_and_set_ctrl(dev, 0,
+ pci_msix_clear_and_set_ctrl(dev, 0,
PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE);
for (i = 0; i < nvec; i++) {
@@ -654,7 +633,7 @@ static int msix_capability_init(struct p
dev->msix_enabled = 1;
populate_msi_sysfs(dev);
- msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0);
+ pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0);
return 0;
}
@@ -755,7 +734,7 @@ void pci_msi_shutdown(struct pci_dev *de
msi_dev_entry->owner = DOMID_IO;
/* Disable MSI mode */
- msi_set_enable(dev, 0);
+ pci_msi_set_enable(dev, 0);
pci_intx_for_msi(dev, 1);
dev->msi_enabled = 0;
}
@@ -928,7 +907,7 @@ void pci_msix_shutdown(struct pci_dev *d
/* Disable MSI mode */
if (is_initial_xendomain()) {
- msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
+ pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
pci_intx_for_msi(dev, 1);
}
dev->msix_enabled = 0;
@@ -960,26 +939,6 @@ EXPORT_SYMBOL(pci_msi_enabled);
void pci_msi_init_pci_dev(struct pci_dev *dev)
{
INIT_LIST_HEAD(&dev->msi_list);
-
- /* Disable the msi hardware to avoid screaming interrupts
- * during boot. This is the power on reset default so
- * usually this should be a noop.
- * But on a Xen host don't do this for
- * - IOMMUs which the hypervisor is in control of (and hence has
- * already enabled on purpose),
- * - unprivileged domains.
- */
- if (is_initial_xendomain()
- && (dev->class >> 8) == PCI_CLASS_SYSTEM_IOMMU
- && dev->vendor == PCI_VENDOR_ID_AMD)
- return;
- dev->msi_cap = pci_find_capability(dev, PCI_CAP_ID_MSI);
- if (dev->msi_cap && is_initial_xendomain())
- msi_set_enable(dev, 0);
-
- dev->msix_cap = pci_find_capability(dev, PCI_CAP_ID_MSIX);
- if (dev->msix_cap && is_initial_xendomain())
- msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
}
/**
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -12,6 +12,7 @@
#include <linux/module.h>
#include <linux/cpumask.h>
#include <linux/pci-aspm.h>
+#include <xen/xen.h>
#include <asm-generic/pci-bridge.h>
#include "pci.h"
@@ -1148,13 +1149,22 @@ void pci_msi_setup_pci_dev(struct pci_de
* Disable the MSI hardware to avoid screaming interrupts
* during boot. This is the power on reset default so
* usually this should be a noop.
+ *
+ * But on a Xen host don't do this for
+ * - IOMMUs which the hypervisor is in control of (and hence has
+ * already enabled on purpose),
+ * - unprivileged domains.
*/
+ if (xen_initial_domain()
+ && (dev->class >> 8) == PCI_CLASS_SYSTEM_IOMMU
+ && dev->vendor == PCI_VENDOR_ID_AMD)
+ return;
dev->msi_cap = pci_find_capability(dev, PCI_CAP_ID_MSI);
- if (dev->msi_cap)
+ if (dev->msi_cap && xen_initial_domain())
pci_msi_set_enable(dev, 0);
dev->msix_cap = pci_find_capability(dev, PCI_CAP_ID_MSIX);
- if (dev->msix_cap)
+ if (dev->msix_cap && xen_initial_domain())
pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
}
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -327,11 +327,9 @@ config XEN_SMPBOOT
config XEN_SPINLOCK_ACQUIRE_NESTING
int "maximum nesting level for acquiring spin locks"
- depends on SMP
+ depends on SMP && !QUEUED_SPINLOCKS
# Xen versions prior to 3.2.x have a race condition with HYPERVISOR_poll().
- depends on !XEN_COMPAT_030002_AND_LATER
- depends on !XEN_COMPAT_030004_AND_LATER
- depends on !XEN_COMPAT_030100_AND_LATER
+ depends on XEN_COMPAT >= 0x030200
range 0 3
default 0
help
--- a/drivers/xen/core/evtchn.c
+++ b/drivers/xen/core/evtchn.c
@@ -1261,10 +1261,82 @@ void irq_resume(void)
}
#endif
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+#include <asm/irqdomain.h>
+
+struct irq_domain *xen_irq_domain;
+
+static void xen_free_irqs(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs)
+{
+ unsigned int i;
+
+ /* XXX x86_vector_free_irqs() uses x86_vector_domain here. */
+ WARN_ON_ONCE(domain != xen_irq_domain);
+ for (i = 0; i < nr_irqs; i++) {
+ struct irq_data *data;
+
+ data = irq_domain_get_irq_data(domain, virq + i);
+ if (data && data->chip_data) {
+ if (virq + i >= ARRAY_SIZE(_irq_cfg))
+ kfree(data->chip_data);
+ irq_domain_reset_irq_data(data);
+ }
+ }
+}
+
+static int xen_alloc_irqs(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ unsigned int i;
+#ifdef CONFIG_X86
+ struct irq_alloc_info *info = arg;
+
+ /* Currently this allocator can't guarantee contiguous allocations. */
+ if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1)
+ return -ENOSYS;
+#endif
+
+ for (i = 0; i < nr_irqs; i++) {
+ struct irq_data *data;
+ struct irq_cfg *cfg;
+
+ data = irq_domain_get_irq_data(domain, virq + i);
+ BUG_ON(!data);
+
+ if (virq + i < ARRAY_SIZE(_irq_cfg))
+ cfg = _irq_cfg + virq + i;
+ else
+ cfg = kzalloc(sizeof(*cfg), GFP_KERNEL);
+ if (!cfg) {
+ xen_free_irqs(domain, virq, i);
+ return -ENOMEM;
+ }
+
+ data->chip_data = cfg;
+ data->hwirq = virq + i;
+ }
+
+ return 0;
+}
+
+static const struct irq_domain_ops xen_irq_domain_ops = {
+ .alloc = xen_alloc_irqs,
+ .free = xen_free_irqs,
+};
+#endif
+
int __init arch_early_irq_init(void)
{
unsigned int i;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ xen_irq_domain = irq_domain_add_tree(NULL, &xen_irq_domain_ops,
+ NULL);
+ BUG_ON(xen_irq_domain == NULL);
+ irq_set_default_host(xen_irq_domain);
+#endif
+
for (i = 0; i < ARRAY_SIZE(_irq_cfg); i++)
irq_set_chip_data(i, _irq_cfg + i);
@@ -1273,6 +1345,12 @@ int __init arch_early_irq_init(void)
struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
{
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ struct irq_alloc_info info = {};
+ int res = __irq_domain_alloc_irqs(NULL, at, 1, node, &info, false);
+
+ return res >= 0 || res == -EEXIST ? irq_cfg(at) : NULL;
+#else
int res = irq_alloc_desc_at(at, node);
struct irq_cfg *cfg = NULL;
@@ -1299,7 +1377,8 @@ struct irq_cfg *alloc_irq_and_cfg_at(uns
return cfg;
#else
return irq_cfg(at);
-#endif
+#endif /* CONFIG_SPARSE_IRQ */
+#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
}
#ifdef CONFIG_SPARSE_IRQ
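
With the hierarchy in place, a fixed event-channel IRQ is carved out of xen_irq_domain instead of going through irq_alloc_desc_at(); a sketch of what alloc_irq_and_cfg_at() above boils down to:

	struct irq_alloc_info info = {};
	/* domain == NULL selects the default host, i.e. xen_irq_domain */
	int res = __irq_domain_alloc_irqs(NULL, at, 1, node, &info, false);

	if (res < 0 && res != -EEXIST)
		return NULL;	/* allocation failed */
	return irq_cfg(at);	/* chip_data was set up by xen_alloc_irqs() */
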
--- a/drivers/xen/core/gnttab.c
+++ b/drivers/xen/core/gnttab.c
@@ -37,6 +37,7 @@
#include <linux/mm.h>
#include <linux/seqlock.h>
#include <linux/timer.h>
+#include <linux/vmalloc.h>
#include <xen/interface/xen.h>
#include <xen/gnttab.h>
#include <asm/pgtable.h>
--- a/drivers/xen/core/smpboot.c
+++ b/drivers/xen/core/smpboot.c
@@ -16,6 +16,7 @@
#include <linux/percpu.h>
#include <linux/tick.h>
#include <asm/desc.h>
+#include <asm/proto.h>
#include <asm/pgalloc.h>
#include <xen/evtchn.h>
#include <xen/interface/vcpu.h>
@@ -27,7 +28,6 @@ extern void local_teardown_timer(unsigne
extern void hypervisor_callback(void);
extern void failsafe_callback(void);
-extern void system_call(void);
extern void smp_trap_init(trap_info_t *);
cpumask_var_t vcpu_initialized_mask;
@@ -246,7 +246,7 @@ static void cpu_initialize_context(unsig
ctxt.user_regs.fs = __KERNEL_PERCPU;
ctxt.user_regs.gs = __KERNEL_STACK_CANARY;
#else /* __x86_64__ */
- ctxt.syscall_callback_eip = (unsigned long)system_call;
+ ctxt.syscall_callback_eip = (unsigned long)entry_SYSCALL_64;
ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
@@ -390,12 +390,12 @@ int __cpu_up(unsigned int cpu, struct ta
#ifdef CONFIG_X86_64
clear_tsk_thread_flag(idle, TIF_FORK);
+ per_cpu(cpu_sp0, cpu) = (unsigned long)task_stack_page(idle) +
+ THREAD_SIZE;
#else
per_cpu(cpu_current_top_of_stack, cpu) =
(unsigned long)task_stack_page(idle) + THREAD_SIZE;
#endif
- per_cpu(kernel_stack, cpu) = (unsigned long)task_stack_page(idle) +
- THREAD_SIZE;
per_cpu(current_task, cpu) = idle;
cpu_initialize_context(cpu, idle->thread.sp0);
--- a/drivers/xen/pcifront/pci_op.c
+++ b/drivers/xen/pcifront/pci_op.c
@@ -5,10 +5,10 @@
*/
#include <linux/module.h>
#include <linux/init.h>
+#include <linux/ktime.h>
#include <linux/pci.h>
#include <linux/spinlock.h>
#include <asm/bitops.h>
-#include <linux/time.h>
#include <xen/evtchn.h>
#include "pcifront.h"
@@ -173,7 +173,6 @@ static int do_pci_op(struct pcifront_dev
unsigned long irq_flags;
evtchn_port_t port = pdev->evtchn;
s64 ns, ns_timeout;
- struct timeval tv;
spin_lock_irqsave(&pdev->sh_info_lock, irq_flags);
@@ -190,8 +189,7 @@ static int do_pci_op(struct pcifront_dev
* (in the latter case we end up continually re-executing poll() with a
* timeout in the past). 1s difference gives plenty of slack for error.
*/
- do_gettimeofday(&tv);
- ns_timeout = timeval_to_ns(&tv) + 2 * (s64)NSEC_PER_SEC;
+ ns_timeout = ktime_get_ns() + 2 * (s64)NSEC_PER_SEC;
clear_evtchn(port);
@@ -200,8 +198,7 @@ static int do_pci_op(struct pcifront_dev
if (HYPERVISOR_poll(&port, 1, jiffies + 3*HZ))
BUG();
clear_evtchn(port);
- do_gettimeofday(&tv);
- ns = timeval_to_ns(&tv);
+ ns = ktime_get_ns();
if (ns > ns_timeout) {
dev_err(&pdev->xdev->dev,
"pciback not responding!!!\n");
@@ -434,9 +431,15 @@ int pcifront_scan_root(struct pcifront_d
unsigned int domain, unsigned int bus)
{
struct pci_bus *b;
+ LIST_HEAD(resources);
struct pcifront_sd *sd;
struct pci_bus_entry *bus_entry;
int err = 0;
+ static struct resource busn_res = {
+ .start = 0,
+ .end = 255,
+ .flags = IORESOURCE_BUS,
+ };
#ifndef CONFIG_PCI_DOMAINS
if (domain != 0) {
@@ -457,14 +460,18 @@ int pcifront_scan_root(struct pcifront_d
err = -ENOMEM;
goto err_out;
}
+ pci_add_resource(&resources, &ioport_resource);
+ pci_add_resource(&resources, &iomem_resource);
+ pci_add_resource(&resources, &busn_res);
pcifront_init_sd(sd, domain, bus, pdev);
pci_lock_rescan_remove();
- b = pci_scan_bus_parented(&pdev->xdev->dev, bus,
- &pcifront_bus_ops, sd);
+ b = pci_scan_root_bus(&pdev->xdev->dev, bus,
+ &pcifront_bus_ops, sd, &resources);
if (!b) {
pci_unlock_rescan_remove();
+ pci_free_resource_list(&resources);
dev_err(&pdev->xdev->dev,
"Error creating PCI Frontend Bus!\n");
err = -ENOMEM;
--- a/drivers/xen/sfc_netutil/accel_util.c
+++ b/drivers/xen/sfc_netutil/accel_util.c
@@ -25,6 +25,7 @@
#include <linux/slab.h>
#include <linux/if_ether.h>
#include <linux/module.h>
+#include <linux/vmalloc.h>
#include <asm/io.h>
#include <asm/pgtable.h>
#include <asm/hypercall.h>
--- a/drivers/xen/xen-pciback/xenbus.c
+++ b/drivers/xen/xen-pciback/xenbus.c
@@ -15,6 +15,7 @@
#include <xen/events.h>
#include <asm/xen/pci.h>
#else
+#include <linux/vmalloc.h>
#include <xen/evtchn.h>
#endif
#include "pciback.h"
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -41,7 +41,7 @@
#include <linux/vmalloc.h>
#include <linux/export.h>
#include <asm/xen/hypervisor.h>
-#include <asm/xen/page.h>
+#include <xen/page.h>
#include <xen/interface/xen.h>
#include <xen/interface/event_channel.h>
#include <xen/balloon.h>
@@ -411,16 +411,16 @@ int xenbus_grant_ring(struct xenbus_devi
int i, j;
for (i = 0; i < nr_pages; i++) {
- unsigned long addr = (unsigned long)vaddr +
- (PAGE_SIZE * i);
err = gnttab_grant_foreign_access(dev->otherend_id,
- virt_to_mfn(addr), 0);
+ virt_to_mfn(vaddr), 0);
if (err < 0) {
xenbus_dev_fatal(dev, err,
"granting access to ring page");
goto fail;
}
grefs[i] = err;
+
+ vaddr = vaddr + PAGE_SIZE;
}
return 0;
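
A hypothetical frontend granting a two-page ring with this interface (sketch; assumes the 4.2 signature taking a virtual address, a page count and a gref array with one entry per page):

	grant_ref_t grefs[2];
	void *ring = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);
	int err = xenbus_grant_ring(dev, ring, 2, grefs);

	if (err)
		xenbus_dev_fatal(dev, err, "granting ring pages");
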
@@ -845,8 +845,10 @@ static int xenbus_unmap_ring_vfree_hvm(s
rv = xenbus_unmap_ring(dev, node->handles, node->nr_handles,
addrs);
- if (!rv)
+ if (!rv) {
vunmap(vaddr);
+ free_xenballooned_pages(node->nr_handles, node->hvm.pages);
+ }
else
WARN(1, "Leaking %p, size %u page(s)\n", vaddr,
node->nr_handles);
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -1317,7 +1317,7 @@ static int xenbus_resume_cb(struct notif
int err = 0;
if (xen_hvm_domain()) {
- uint64_t v;
+ uint64_t v = 0;
err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v);
if (!err && v)
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -167,6 +167,8 @@ struct msi_controller {
struct msi_desc *desc);
void (*teardown_irq)(struct msi_controller *chip, unsigned int irq);
};
+#else /* CONFIG_XEN */
+struct msi_msg;
#endif /* CONFIG_XEN */
#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
--- a/include/xen/evtchn.h
+++ b/include/xen/evtchn.h
@@ -61,6 +61,7 @@ struct irq_cfg {
};
struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node);
static inline int evtchn_make_refcounted(unsigned int evtchn) { return 0; }
+extern struct irq_domain *xen_irq_domain;
#endif
/*
--- a/kernel/configs/xen.config
+++ b/kernel/configs/xen.config
@@ -18,26 +18,26 @@ CONFIG_TTY=y
CONFIG_INET=y
CONFIG_BINFMT_ELF=y
# generic config
-CONFIG_XEN=y
+CONFIG_PARAVIRT_XEN=y
CONFIG_XEN_DOM0=y
# backend drivers
-CONFIG_XEN_BACKEND=y
-CONFIG_XEN_BLKDEV_BACKEND=m
-CONFIG_XEN_NETDEV_BACKEND=m
+CONFIG_PARAVIRT_XEN_BACKEND=y
+CONFIG_PARAVIRT_XEN_BLKDEV_BACKEND=m
+CONFIG_PARAVIRT_XEN_NETDEV_BACKEND=m
CONFIG_HVC_XEN=y
CONFIG_XEN_WDT=m
-CONFIG_XEN_SCSI_BACKEND=m
+CONFIG_PARAVIRT_XEN_SCSI_BACKEND=m
# frontend drivers
CONFIG_XEN_FBDEV_FRONTEND=m
CONFIG_HVC_XEN_FRONTEND=y
CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m
-CONFIG_XEN_SCSI_FRONTEND=m
+CONFIG_PARAVIRT_XEN_BLKDEV_FRONTEND=m
+CONFIG_PARAVIRT_XEN_NETDEV_FRONTEND=m
+CONFIG_PARAVIRT_XEN_SCSI_FRONTEND=m
# others
CONFIG_XEN_BALLOON=y
CONFIG_XEN_SCRUB_PAGES=y
CONFIG_XEN_DEV_EVTCHN=m
-CONFIG_XEN_BLKDEV_FRONTEND=m
-CONFIG_XEN_NETDEV_FRONTEND=m
CONFIG_XENFS=m
CONFIG_XEN_COMPAT_XENFS=y
CONFIG_XEN_SYS_HYPERVISOR=y
--- a/lib/swiotlb-xen.c
+++ b/lib/swiotlb-xen.c
@@ -26,6 +26,7 @@
#include <linux/iommu-helper.h>
#include <linux/highmem.h>
#include <linux/gfp.h>
+#include <linux/scatterlist.h>
#include <asm/io.h>
#include <asm/pci.h>