From: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
Subject: Linux: 4.2
Patch-mainline: Never, SUSE-Xen specific

 This patch contains the differences between 4.1 and 4.2.

Automatically created from "patch-4.2" by xen-port-patches.py
Acked-by: jbeulich@suse.com

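 For reference, the net effect of the CONFIG_XEN -> CONFIG_PARAVIRT_XEN rename
 can be seen in arch/arm/include/asm/xen/hypervisor.h; a minimal sketch of the
 guarded declaration after the patch (reconstructed from the first hunk below,
 surrounding context abbreviated):

     /* upstream pvops Xen guest support, guarded by PARAVIRT_XEN in this tree */
     #ifdef CONFIG_PARAVIRT_XEN
     void __init xen_early_init(void);
     #else
     static inline void xen_early_init(void) { return; }
     #endif

 In this tree CONFIG_PARAVIRT_XEN guards the upstream pvops Xen guest code,
 while CONFIG_XEN refers to the forward-ported SUSE Xen implementation that the
 remaining hunks (Kconfig "&& !XEN" dependencies, entry_32-xen.S, entry_64-xen.S)
 extend for 4.2.
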
--- a/arch/arm/include/asm/xen/hypervisor.h
+++ b/arch/arm/include/asm/xen/hypervisor.h
@@ -20,7 +20,7 @@ static inline enum paravirt_lazy_mode pa
 
 extern struct dma_map_ops *xen_dma_ops;
 
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
 void __init xen_early_init(void);
 #else
 static inline void xen_early_init(void) { return; }
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -902,7 +902,7 @@ config NR_CPUS
 
 config SCHED_SMT
 	bool "SMT (Hyperthreading) scheduler support"
-	depends on SMP
+	depends on SMP && !XEN
 	---help---
 	  SMT scheduler support improves the CPU scheduler's decision making
 	  when dealing with Intel Pentium 4 chips with HyperThreading at a
@@ -912,7 +912,7 @@ config SCHED_SMT
 config SCHED_MC
 	def_bool y
 	prompt "Multi-core scheduler support"
-	depends on SMP
+	depends on SMP && !XEN
 	---help---
 	  Multi-core scheduler support improves the CPU scheduler's decision
 	  making when dealing with multi-core CPU chips at a cost of slightly
@@ -955,7 +955,7 @@ config X86_LOCAL_APIC
 	depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI
 	depends on !XEN_UNPRIVILEGED_GUEST
 	select IRQ_DOMAIN_HIERARCHY
-	select PCI_MSI_IRQ_DOMAIN if PCI_MSI
+	select PCI_MSI_IRQ_DOMAIN if PCI_MSI && !XEN
 
 config X86_IO_APIC
 	def_bool y
@@ -1044,6 +1044,7 @@ config X86_LEGACY_VM86
 	bool "Legacy VM86 support"
 	default n
 	depends on X86_32
+	depends on !XEN || (XEN_COMPAT < 0x040300)
 	---help---
 	  This option allows user programs to put the CPU into V8086
 	  mode, which is an 80286-era approximation of 16-bit real mode.
--- a/arch/x86/configs/xen.config
+++ b/arch/x86/configs/xen.config
@@ -22,7 +22,7 @@ CONFIG_XEN_ACPI_PROCESSOR=m
 # x86 specific backend drivers
 CONFIG_XEN_PCIDEV_BACKEND=m
 # x86 specific frontend drivers
-CONFIG_XEN_PCIDEV_FRONTEND=m
+CONFIG_PARAVIRT_XEN_PCIDEV_FRONTEND=m
 # depends on MEMORY_HOTPLUG, arm64 doesn't enable this yet,
 # move to generic config if it ever does.
 CONFIG_XEN_BALLOON_MEMORY_HOTPLUG=y
--- a/arch/x86/entry/entry_32-xen.S
+++ b/arch/x86/entry/entry_32-xen.S
@@ -1,29 +1,18 @@
 /*
+ *  Copyright (C) 1991,1992  Linus Torvalds
  *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- */
-
-/*
- * entry.S contains the system-call and fault low-level handling routines.
- * This also contains the timer-interrupt handler, as well as all interrupts
- * and faults that can result in a task-switch.
- *
- * NOTE: This code handles signal-recognition, which happens every time
- * after a timer-interrupt and after each system call.
- *
- * I changed all the .align's to 4 (16 byte alignment), as that's faster
- * on a 486.
+ * entry_32.S contains the system-call and low-level fault and trap handling routines.
  *
  * Stack layout in 'syscall_exit':
- * 	ptrace needs to have all regs on the stack.
- *	if the order here is changed, it needs to be
- *	updated in fork.c:copy_process, signal.c:do_signal,
+ *	ptrace needs to have all registers on the stack.
+ *	If the order here is changed, it needs to be
+ *	updated in fork.c:copy_process(), signal.c:do_signal(),
  *	ptrace.c and ptrace.h
  *
  *	 0(%esp) - %ebx
  *	 4(%esp) - %ecx
  *	 8(%esp) - %edx
- *       C(%esp) - %esi
+ *	 C(%esp) - %esi
  *	10(%esp) - %edi
  *	14(%esp) - %ebp
  *	18(%esp) - %eax
@@ -37,8 +26,6 @@
  *	38(%esp) - %eflags
  *	3C(%esp) - %oldesp
  *	40(%esp) - %oldss
- *
- * "current" is in register %ebx during any slow entries.
  */
 
 #include <linux/linkage.h>
@@ -50,7 +37,6 @@
 #include <asm/smp.h>
 #include <asm/page_types.h>
 #include <asm/percpu.h>
-#include <asm/dwarf2.h>
 #include <asm/processor-flags.h>
 #include <asm/ftrace.h>
 #include <asm/irq_vectors.h>
@@ -63,11 +49,11 @@
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
 #define AUDIT_ARCH_I386		(EM_386|__AUDIT_ARCH_LE)
-#define __AUDIT_ARCH_LE	   0x40000000
+#define __AUDIT_ARCH_LE		0x40000000
 
 #ifndef CONFIG_AUDITSYSCALL
-#define sysenter_audit	syscall_trace_entry
-#define sysexit_audit	syscall_exit_work
+# define sysenter_audit		syscall_trace_entry
+# define sysexit_audit		syscall_exit_work
 #endif
 
 	.section .entry.text, "ax"
@@ -89,16 +75,16 @@
 NMI_MASK	= 0x80000000
 
 #ifdef CONFIG_PREEMPT
-#define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
+# define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
 #else
-#define preempt_stop(clobbers)
-#define resume_kernel		restore_all
+# define preempt_stop(clobbers)
+# define resume_kernel		restore_all
 #endif
 
 .macro TRACE_IRQS_IRET
 #ifdef CONFIG_TRACE_IRQFLAGS
-	testl $X86_EFLAGS_IF,PT_EFLAGS(%esp)     # interrupts off?
-	jz 1f
+	testl	$X86_EFLAGS_IF, PT_EFLAGS(%esp)     # interrupts off?
+	jz	1f
 	TRACE_IRQS_ON
 1:
 #endif
@@ -117,11 +103,10 @@ NMI_MASK	= 0x80000000
 
  /* unfortunately push/pop can't be no-op */
 .macro PUSH_GS
-	pushl_cfi $0
+	pushl	$0
 .endm
 .macro POP_GS pop=0
-	addl $(4 + \pop), %esp
-	CFI_ADJUST_CFA_OFFSET -(4 + \pop)
+	addl	$(4 + \pop), %esp
 .endm
 .macro POP_GS_EX
 .endm
@@ -141,181 +126,119 @@ NMI_MASK	= 0x80000000
 #else	/* CONFIG_X86_32_LAZY_GS */
 
 .macro PUSH_GS
-	pushl_cfi %gs
-	/*CFI_REL_OFFSET gs, 0*/
+	pushl	%gs
 .endm
 
 .macro POP_GS pop=0
-98:	popl_cfi %gs
-	/*CFI_RESTORE gs*/
+98:	popl	%gs
   .if \pop <> 0
-	add $\pop, %esp
-	CFI_ADJUST_CFA_OFFSET -\pop
+	add	$\pop, %esp
   .endif
 .endm
 .macro POP_GS_EX
 .pushsection .fixup, "ax"
-99:	movl $0, (%esp)
-	jmp 98b
+99:	movl	$0, (%esp)
+	jmp	98b
 .popsection
-	_ASM_EXTABLE(98b,99b)
+	_ASM_EXTABLE(98b, 99b)
 .endm
 
 .macro PTGS_TO_GS
-98:	mov PT_GS(%esp), %gs
+98:	mov	PT_GS(%esp), %gs
 .endm
 .macro PTGS_TO_GS_EX
 .pushsection .fixup, "ax"
-99:	movl $0, PT_GS(%esp)
-	jmp 98b
+99:	movl	$0, PT_GS(%esp)
+	jmp	98b
 .popsection
-	_ASM_EXTABLE(98b,99b)
+	_ASM_EXTABLE(98b, 99b)
 .endm
 
 .macro GS_TO_REG reg
-	movl %gs, \reg
-	/*CFI_REGISTER gs, \reg*/
+	movl	%gs, \reg
 .endm
 .macro REG_TO_PTGS reg
-	movl \reg, PT_GS(%esp)
-	/*CFI_REL_OFFSET gs, PT_GS*/
+	movl	\reg, PT_GS(%esp)
 .endm
 .macro SET_KERNEL_GS reg
-	movl $(__KERNEL_STACK_CANARY), \reg
-	movl \reg, %gs
+	movl	$(__KERNEL_STACK_CANARY), \reg
+	movl	\reg, %gs
 .endm
 
-#endif	/* CONFIG_X86_32_LAZY_GS */
+#endif /* CONFIG_X86_32_LAZY_GS */
 
 .macro SAVE_ALL
 	cld
 	PUSH_GS
-	pushl_cfi %fs
-	/*CFI_REL_OFFSET fs, 0;*/
-	pushl_cfi %es
-	/*CFI_REL_OFFSET es, 0;*/
-	pushl_cfi %ds
-	/*CFI_REL_OFFSET ds, 0;*/
-	pushl_cfi %eax
-	CFI_REL_OFFSET eax, 0
-	pushl_cfi %ebp
-	CFI_REL_OFFSET ebp, 0
-	pushl_cfi %edi
-	CFI_REL_OFFSET edi, 0
-	pushl_cfi %esi
-	CFI_REL_OFFSET esi, 0
-	pushl_cfi %edx
-	CFI_REL_OFFSET edx, 0
-	pushl_cfi %ecx
-	CFI_REL_OFFSET ecx, 0
-	pushl_cfi %ebx
-	CFI_REL_OFFSET ebx, 0
-	movl $(__USER_DS), %edx
-	movl %edx, %ds
-	movl %edx, %es
-	movl $(__KERNEL_PERCPU), %edx
-	movl %edx, %fs
+	pushl	%fs
+	pushl	%es
+	pushl	%ds
+	pushl	%eax
+	pushl	%ebp
+	pushl	%edi
+	pushl	%esi
+	pushl	%edx
+	pushl	%ecx
+	pushl	%ebx
+	movl	$(__USER_DS), %edx
+	movl	%edx, %ds
+	movl	%edx, %es
+	movl	$(__KERNEL_PERCPU), %edx
+	movl	%edx, %fs
 	SET_KERNEL_GS %edx
 .endm
 
 .macro RESTORE_INT_REGS
-	popl_cfi %ebx
-	CFI_RESTORE ebx
-	popl_cfi %ecx
-	CFI_RESTORE ecx
-	popl_cfi %edx
-	CFI_RESTORE edx
-	popl_cfi %esi
-	CFI_RESTORE esi
-	popl_cfi %edi
-	CFI_RESTORE edi
-	popl_cfi %ebp
-	CFI_RESTORE ebp
-	popl_cfi %eax
-	CFI_RESTORE eax
+	popl	%ebx
+	popl	%ecx
+	popl	%edx
+	popl	%esi
+	popl	%edi
+	popl	%ebp
+	popl	%eax
 .endm
 
 .macro RESTORE_REGS pop=0
 	RESTORE_INT_REGS
-1:	popl_cfi %ds
-	/*CFI_RESTORE ds;*/
-2:	popl_cfi %es
-	/*CFI_RESTORE es;*/
-3:	popl_cfi %fs
-	/*CFI_RESTORE fs;*/
+1:	popl	%ds
+2:	popl	%es
+3:	popl	%fs
 	POP_GS \pop
 .pushsection .fixup, "ax"
-4:	movl $0, (%esp)
-	jmp 1b
-5:	movl $0, (%esp)
-	jmp 2b
-6:	movl $0, (%esp)
-	jmp 3b
+4:	movl	$0, (%esp)
+	jmp	1b
+5:	movl	$0, (%esp)
+	jmp	2b
+6:	movl	$0, (%esp)
+	jmp	3b
 .popsection
-	_ASM_EXTABLE(1b,4b)
-	_ASM_EXTABLE(2b,5b)
-	_ASM_EXTABLE(3b,6b)
+	_ASM_EXTABLE(1b, 4b)
+	_ASM_EXTABLE(2b, 5b)
+	_ASM_EXTABLE(3b, 6b)
 	POP_GS_EX
 .endm
 
-.macro RING0_INT_FRAME
-	CFI_STARTPROC simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA esp, 3*4
-	/*CFI_OFFSET cs, -2*4;*/
-	CFI_OFFSET eip, -3*4
-.endm
-
-.macro RING0_EC_FRAME
-	CFI_STARTPROC simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA esp, 4*4
-	/*CFI_OFFSET cs, -2*4;*/
-	CFI_OFFSET eip, -3*4
-.endm
-
-.macro RING0_PTREGS_FRAME
-	CFI_STARTPROC simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA esp, PT_OLDESP-PT_EBX
-	/*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/
-	CFI_OFFSET eip, PT_EIP-PT_OLDESP
-	/*CFI_OFFSET es, PT_ES-PT_OLDESP;*/
-	/*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/
-	CFI_OFFSET eax, PT_EAX-PT_OLDESP
-	CFI_OFFSET ebp, PT_EBP-PT_OLDESP
-	CFI_OFFSET edi, PT_EDI-PT_OLDESP
-	CFI_OFFSET esi, PT_ESI-PT_OLDESP
-	CFI_OFFSET edx, PT_EDX-PT_OLDESP
-	CFI_OFFSET ecx, PT_ECX-PT_OLDESP
-	CFI_OFFSET ebx, PT_EBX-PT_OLDESP
-.endm
-
 ENTRY(ret_from_fork)
-	CFI_STARTPROC
-	pushl_cfi %eax
-	call schedule_tail
+	pushl	%eax
+	call	schedule_tail
 	GET_THREAD_INFO(%ebp)
-	popl_cfi %eax
-	pushl_cfi $0x0202		# Reset kernel eflags
-	popfl_cfi
-	jmp syscall_exit
-	CFI_ENDPROC
+	popl	%eax
+	pushl	$0x0202				# Reset kernel eflags
+	popfl
+	jmp	syscall_exit
 END(ret_from_fork)
 
 ENTRY(ret_from_kernel_thread)
-	CFI_STARTPROC
-	pushl_cfi %eax
-	call schedule_tail
+	pushl	%eax
+	call	schedule_tail
 	GET_THREAD_INFO(%ebp)
-	popl_cfi %eax
-	pushl_cfi $0x0202		# Reset kernel eflags
-	popfl_cfi
-	movl PT_EBP(%esp),%eax
-	call *PT_EBX(%esp)
-	movl $0,PT_EAX(%esp)
-	jmp syscall_exit
-	CFI_ENDPROC
+	popl	%eax
+	pushl	$0x0202				# Reset kernel eflags
+	popfl
+	movl	PT_EBP(%esp), %eax
+	call	*PT_EBX(%esp)
+	movl	$0, PT_EAX(%esp)
+	jmp	syscall_exit
 ENDPROC(ret_from_kernel_thread)
 
 /*
@@ -327,76 +250,70 @@ ENDPROC(ret_from_kernel_thread)
 
 	# userspace resumption stub bypassing syscall exit tracing
 	ALIGN
-	RING0_PTREGS_FRAME
 ret_from_exception:
 	preempt_stop(CLBR_ANY)
 ret_from_intr:
 	GET_THREAD_INFO(%ebp)
 #ifdef CONFIG_VM86
-	movl PT_EFLAGS(%esp), %eax	# mix EFLAGS and CS
-	movb PT_CS(%esp), %al
-	andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
+	movl	PT_EFLAGS(%esp), %eax		# mix EFLAGS and CS
+	movb	PT_CS(%esp), %al
+	andl	$(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
 #else
 	/*
 	 * We can be coming here from child spawned by kernel_thread().
 	 */
-	movl PT_CS(%esp), %eax
-	andl $SEGMENT_RPL_MASK, %eax
+	movl	PT_CS(%esp), %eax
+	andl	$SEGMENT_RPL_MASK, %eax
 #endif
-	cmpl $USER_RPL, %eax
-	jb resume_kernel		# not returning to v8086 or userspace
+	cmpl	$USER_RPL, %eax
+	jb	resume_kernel			# not returning to v8086 or userspace
 
 ENTRY(resume_userspace)
 	LOCKDEP_SYS_EXIT
- 	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
-					# setting need_resched or sigpending
-					# between sampling and the iret
+	DISABLE_INTERRUPTS(CLBR_ANY)		# make sure we don't miss an interrupt
+						# setting need_resched or sigpending
+						# between sampling and the iret
 	TRACE_IRQS_OFF
-	movl TI_flags(%ebp), %ecx
-	andl $_TIF_WORK_MASK, %ecx	# is there any work to be done on
-					# int/exception return?
-	jne work_pending
-	jmp restore_all
+	movl	TI_flags(%ebp), %ecx
+	andl	$_TIF_WORK_MASK, %ecx		# is there any work to be done on
+						# int/exception return?
+	jne	work_pending
+	jmp	restore_all
 END(ret_from_exception)
 
 #ifdef CONFIG_PREEMPT
 ENTRY(resume_kernel)
 	DISABLE_INTERRUPTS(CLBR_ANY)
 need_resched:
-	cmpl $0,PER_CPU_VAR(__preempt_count)
-	jnz restore_all
-	testl $X86_EFLAGS_IF,PT_EFLAGS(%esp)	# interrupts off (exception path) ?
-	jz restore_all
-	call preempt_schedule_irq
-	jmp need_resched
+	cmpl	$0, PER_CPU_VAR(__preempt_count)
+	jnz	restore_all
+	testl	$X86_EFLAGS_IF, PT_EFLAGS(%esp)	# interrupts off (exception path) ?
+	jz	restore_all
+	call	preempt_schedule_irq
+	jmp	need_resched
 END(resume_kernel)
 #endif
-	CFI_ENDPROC
 
-/* SYSENTER_RETURN points to after the "sysenter" instruction in
-   the vsyscall page.  See vsyscall-sysentry.S, which defines the symbol.  */
+/*
+ * SYSENTER_RETURN points to after the SYSENTER instruction
+ * in the vsyscall page.  See vsyscall-sysentry.S, which defines
+ * the symbol.
+ */
 
-	# sysenter call handler stub
-ENTRY(ia32_sysenter_target)
-	CFI_STARTPROC simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA esp, 0
-	CFI_REGISTER esp, ebp
-	movl SYSENTER_stack_sp0(%esp),%esp
+	# SYSENTER  call handler stub
+ENTRY(entry_SYSENTER_32)
+	movl	SYSENTER_stack_sp0(%esp), %esp
 sysenter_past_esp:
 	/*
 	 * Interrupts are disabled here, but we can't trace it until
 	 * enough kernel state to call TRACE_IRQS_OFF can be called - but
 	 * we immediately enable interrupts at that point anyway.
 	 */
-	pushl_cfi $__USER_DS
-	/*CFI_REL_OFFSET ss, 0*/
-	pushl_cfi %ebp
-	CFI_REL_OFFSET esp, 0
-	pushfl_cfi
-	orl $X86_EFLAGS_IF, (%esp)
-	pushl_cfi $__USER_CS
-	/*CFI_REL_OFFSET cs, 0*/
+	pushl	$__USER_DS
+	pushl	%ebp
+	pushfl
+	orl	$X86_EFLAGS_IF, (%esp)
+	pushl	$__USER_CS
 	/*
 	 * Push current_thread_info()->sysenter_return to the stack.
 	 * A tiny bit of offset fixup is necessary: TI_sysenter_return
@@ -405,10 +322,9 @@ sysenter_past_esp:
 	 * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack;
 	 * and THREAD_SIZE takes us to the bottom.
 	 */
-	pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp)
-	CFI_REL_OFFSET eip, 0
+	pushl	((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp)
 
-	pushl_cfi %eax
+	pushl	%eax
 	SAVE_ALL
 	ENABLE_INTERRUPTS(CLBR_NONE)
 
@@ -416,174 +332,167 @@ sysenter_past_esp:
  * Load the potential sixth argument from user stack.
  * Careful about security.
  */
-	cmpl $__PAGE_OFFSET-3,%ebp
-	jae syscall_fault
+	cmpl	$__PAGE_OFFSET-3, %ebp
+	jae	syscall_fault
 	ASM_STAC
-1:	movl (%ebp),%ebp
+1:	movl	(%ebp), %ebp
 	ASM_CLAC
-	movl %ebp,PT_EBP(%esp)
-	_ASM_EXTABLE(1b,syscall_fault)
+	movl	%ebp, PT_EBP(%esp)
+	_ASM_EXTABLE(1b, syscall_fault)
 
 	GET_THREAD_INFO(%ebp)
 
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
-	jnz sysenter_audit
+	testl	$_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp)
+	jnz	sysenter_audit
 sysenter_do_call:
-	cmpl $(NR_syscalls), %eax
-	jae sysenter_badsys
-	call *sys_call_table(,%eax,4)
+	cmpl	$(NR_syscalls), %eax
+	jae	sysenter_badsys
+	call	*sys_call_table(, %eax, 4)
 sysenter_after_call:
-	movl %eax,PT_EAX(%esp)
+	movl	%eax, PT_EAX(%esp)
 	LOCKDEP_SYS_EXIT
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF
-	movl TI_flags(%ebp), %ecx
-	testl $_TIF_ALLWORK_MASK, %ecx
-	jnz sysexit_audit
+	movl	TI_flags(%ebp), %ecx
+	testl	$_TIF_ALLWORK_MASK, %ecx
+	jnz	sysexit_audit
 sysenter_exit:
 /* if something modifies registers it must also disable sysexit */
-	movl PT_EIP(%esp), %edx
-	movl PT_OLDESP(%esp), %ecx
-	xorl %ebp,%ebp
+	movl	PT_EIP(%esp), %edx
+	movl	PT_OLDESP(%esp), %ecx
+	xorl	%ebp, %ebp
 	TRACE_IRQS_ON
-1:	mov  PT_FS(%esp), %fs
+1:	mov	PT_FS(%esp), %fs
 	PTGS_TO_GS
 	ENABLE_INTERRUPTS_SYSEXIT
 
 #ifdef CONFIG_AUDITSYSCALL
 sysenter_audit:
-	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
-	jnz syscall_trace_entry
-	/* movl PT_EAX(%esp), %eax	already set, syscall number: 1st arg to audit */
-	movl PT_EBX(%esp), %edx		/* ebx/a0: 2nd arg to audit */
-	/* movl PT_ECX(%esp), %ecx	already set, a1: 3nd arg to audit */
-	pushl_cfi PT_ESI(%esp)		/* a3: 5th arg */
-	pushl_cfi PT_EDX+4(%esp)	/* a2: 4th arg */
-	call __audit_syscall_entry
-	popl_cfi %ecx /* get that remapped edx off the stack */
-	popl_cfi %ecx /* get that remapped esi off the stack */
-	movl PT_EAX(%esp),%eax		/* reload syscall number */
-	jmp sysenter_do_call
+	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), TI_flags(%ebp)
+	jnz	syscall_trace_entry
+	/* movl	PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */
+	movl	PT_EBX(%esp), %edx		/* ebx/a0: 2nd arg to audit */
+	/* movl	PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */
+	pushl	PT_ESI(%esp)			/* a3: 5th arg */
+	pushl	PT_EDX+4(%esp)			/* a2: 4th arg */
+	call	__audit_syscall_entry
+	popl	%ecx				/* get that remapped edx off the stack */
+	popl	%ecx				/* get that remapped esi off the stack */
+	movl	PT_EAX(%esp), %eax		/* reload syscall number */
+	jmp	sysenter_do_call
 
 sysexit_audit:
-	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
-	jnz syscall_exit_work
+	testl	$(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
+	jnz	syscall_exit_work
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_ANY)
-	movl %eax,%edx		/* second arg, syscall return value */
-	cmpl $-MAX_ERRNO,%eax	/* is it an error ? */
-	setbe %al		/* 1 if so, 0 if not */
-	movzbl %al,%eax		/* zero-extend that */
-	call __audit_syscall_exit
+	movl	%eax, %edx			/* second arg, syscall return value */
+	cmpl	$-MAX_ERRNO, %eax		/* is it an error ? */
+	setbe %al				/* 1 if so, 0 if not */
+	movzbl %al, %eax			/* zero-extend that */
+	call	__audit_syscall_exit
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF
-	movl TI_flags(%ebp), %ecx
-	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
-	jnz syscall_exit_work
-	movl PT_EAX(%esp),%eax	/* reload syscall return value */
-	jmp sysenter_exit
+	movl	TI_flags(%ebp), %ecx
+	testl	$(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
+	jnz	syscall_exit_work
+	movl	PT_EAX(%esp), %eax		/* reload syscall return value */
+	jmp	sysenter_exit
 #endif
 
-	CFI_ENDPROC
-.pushsection .fixup,"ax"
-2:	movl $0,PT_FS(%esp)
-	jmp 1b
+.pushsection .fixup, "ax"
+2:	movl	$0, PT_FS(%esp)
+	jmp	1b
 .popsection
-	_ASM_EXTABLE(1b,2b)
+	_ASM_EXTABLE(1b, 2b)
 	PTGS_TO_GS_EX
-ENDPROC(ia32_sysenter_target)
+ENDPROC(entry_SYSENTER_32)
 
 	# pv sysenter call handler stub
-ENTRY(ia32pv_sysenter_target)
-	RING0_INT_FRAME
+ENTRY(entry_SYSENTER_PV32)
 	ASM_CLAC
-	movl $__USER_DS,16(%esp)
-	movl %ebp,12(%esp)
-	movl $__USER_CS,4(%esp)
-	addl $4,%esp
-	CFI_ADJUST_CFA_OFFSET -4
+	movl	$__USER_DS, 16(%esp)
+	movl	%ebp, 12(%esp)
+	movl	$__USER_CS, 4(%esp)
+	addl	$4, %esp
 	/* +5*4 is SS:ESP,EFLAGS,CS:EIP. +8 is esp0 setting. */
-	pushl_cfi (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
+	pushl	TI_sysenter_return-THREAD_SIZE+8+4*4(%esp)
 /*
  * Load the potential sixth argument from user stack.
  * Careful about security.
  */
-	cmpl $__PAGE_OFFSET-3,%ebp
-	jae syscall_fault
+	cmpl	$__PAGE_OFFSET-3, %ebp
+	jae	syscall_fault
 	ASM_STAC
-1:	movl (%ebp),%ebp
+1:	movl	(%ebp), %ebp
 	ASM_CLAC
 	_ASM_EXTABLE(1b,syscall_fault)
-	jmp system_call
-	CFI_ENDPROC
-ENDPROC(ia32pv_sysenter_target)
+	jmp	entry_INT80_32
+ENDPROC(entry_SYSENTER_PV32)
 
 	# system call handler stub
-ENTRY(system_call)
-	RING0_INT_FRAME			# can't unwind into user space anyway
+ENTRY(entry_INT80_32)
 	ASM_CLAC
-	pushl_cfi %eax			# save orig_eax
+	pushl	%eax				# save orig_eax
 	SAVE_ALL
 	GET_THREAD_INFO(%ebp)
-					# system call tracing in operation / emulation
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
-	jnz syscall_trace_entry
-	cmpl $(NR_syscalls), %eax
-	jae syscall_badsys
+						# system call tracing in operation / emulation
+	testl	$_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp)
+	jnz	syscall_trace_entry
+	cmpl	$(NR_syscalls), %eax
+	jae	syscall_badsys
 syscall_call:
-	call *sys_call_table(,%eax,4)
+	call	*sys_call_table(, %eax, 4)
 syscall_after_call:
-	movl %eax,PT_EAX(%esp)		# store the return value
+	movl	%eax, PT_EAX(%esp)		# store the return value
 syscall_exit:
 	LOCKDEP_SYS_EXIT
-	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
-					# setting need_resched or sigpending
-					# between sampling and the iret
+	DISABLE_INTERRUPTS(CLBR_ANY)		# make sure we don't miss an interrupt
+						# setting need_resched or sigpending
+						# between sampling and the iret
 	TRACE_IRQS_OFF
-	movl TI_flags(%ebp), %ecx
-	testl $_TIF_ALLWORK_MASK, %ecx	# current->work
-	jnz syscall_exit_work
+	movl	TI_flags(%ebp), %ecx
+	testl	$_TIF_ALLWORK_MASK, %ecx	# current->work
+	jnz	syscall_exit_work
 
 restore_all:
 	TRACE_IRQS_IRET
 restore_all_notrace:
 #ifdef CONFIG_X86_ESPFIX32
-	movl PT_EFLAGS(%esp), %eax	# mix EFLAGS, SS and CS
-	# Warning: PT_OLDSS(%esp) contains the wrong/random values if we
-	# are returning to the kernel.
-	# See comments in process.c:copy_thread() for details.
-	movb PT_OLDSS(%esp), %ah
-	movb PT_CS(%esp), %al
-	andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
-	cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
-	CFI_REMEMBER_STATE
-	je ldt_ss			# returning to user-space with LDT SS
+	movl	PT_EFLAGS(%esp), %eax		# mix EFLAGS, SS and CS
+	/*
+	 * Warning: PT_OLDSS(%esp) contains the wrong/random values if we
+	 * are returning to the kernel.
+	 * See comments in process.c:copy_thread() for details.
+	 */
+	movb	PT_OLDSS(%esp), %ah
+	movb	PT_CS(%esp), %al
+	andl	$(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
+	cmpl	$((SEGMENT_LDT << 8) | USER_RPL), %eax
+	je ldt_ss				# returning to user-space with LDT SS
 #endif
 restore_nocheck:
 #ifdef CONFIG_XEN
-	movl PT_EFLAGS(%esp), %eax
-	testl $(X86_EFLAGS_VM|NMI_MASK), %eax
-	CFI_REMEMBER_STATE
-	jnz hypervisor_iret
-	shr $9, %eax			# EAX[0] == IRET_EFLAGS.IF
+	movl	PT_EFLAGS(%esp), %eax
+	testl	$(X86_EFLAGS_VM|NMI_MASK), %eax
+	jnz	hypervisor_iret
+	shr	$9, %eax			# EAX[0] == IRET_EFLAGS.IF
 	GET_VCPU_INFO
-	andb evtchn_upcall_mask(%esi),%al
-	andb $1,%al			# EAX[0] == IRET_EFLAGS.IF & event_mask
-	CFI_REMEMBER_STATE
-	jnz restore_all_enable_events	#        != 0 => enable event delivery
+	andb	evtchn_upcall_mask(%esi),%al
+	andb	$1, %al				# EAX[0] == IRET_EFLAGS.IF & event_mask
+	jnz	restore_all_enable_events	#        != 0 => enable event delivery
 #endif
-	RESTORE_REGS 4			# skip orig_eax/error_code
+	RESTORE_REGS 4				# skip orig_eax/error_code
 irq_return:
 	INTERRUPT_RETURN
-.section .fixup,"ax"
-ENTRY(iret_exc)
-	pushl $0			# no error code
-	pushl $do_iret_error
-	jmp error_code
+.section .fixup, "ax"
+ENTRY(iret_exc	)
+	pushl	$0				# no error code
+	pushl	$do_iret_error
+	jmp	error_code
 .previous
-	_ASM_EXTABLE(irq_return,iret_exc)
+	_ASM_EXTABLE(irq_return, iret_exc)
 
-	CFI_RESTORE_STATE
 #ifdef CONFIG_X86_ESPFIX32
 ldt_ss:
 #ifdef CONFIG_PARAVIRT
@@ -595,8 +504,8 @@ ldt_ss:
 	 * is still available to implement the setting of the high
 	 * 16-bits in the INTERRUPT_RETURN paravirt-op.
 	 */
-	cmpl $0, pv_info+PARAVIRT_enabled
-	jne restore_nocheck
+	cmpl	$0, pv_info+PARAVIRT_enabled
+	jne	restore_nocheck
 #endif
 
 /*
@@ -611,22 +520,23 @@ ldt_ss:
  * a base address that matches for the difference.
  */
 #define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
-	mov %esp, %edx			/* load kernel esp */
-	mov PT_OLDESP(%esp), %eax	/* load userspace esp */
-	mov %dx, %ax			/* eax: new kernel esp */
-	sub %eax, %edx			/* offset (low word is 0) */
-	shr $16, %edx
-	mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
-	mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
-	pushl_cfi $__ESPFIX_SS
-	pushl_cfi %eax			/* new kernel esp */
-	/* Disable interrupts, but do not irqtrace this section: we
+	mov	%esp, %edx			/* load kernel esp */
+	mov	PT_OLDESP(%esp), %eax		/* load userspace esp */
+	mov	%dx, %ax			/* eax: new kernel esp */
+	sub	%eax, %edx			/* offset (low word is 0) */
+	shr	$16, %edx
+	mov	%dl, GDT_ESPFIX_SS + 4		/* bits 16..23 */
+	mov	%dh, GDT_ESPFIX_SS + 7		/* bits 24..31 */
+	pushl	$__ESPFIX_SS
+	pushl	%eax				/* new kernel esp */
+	/*
+	 * Disable interrupts, but do not irqtrace this section: we
 	 * will soon execute iret and the tracer was already set to
-	 * the irqstate after the iret */
+	 * the irqstate after the IRET:
+	 */
 	DISABLE_INTERRUPTS(CLBR_EAX)
-	lss (%esp), %esp		/* switch to espfix segment */
-	CFI_ADJUST_CFA_OFFSET -8
-	jmp restore_nocheck
+	lss	(%esp), %esp			/* switch to espfix segment */
+	jmp	restore_nocheck
 #endif
 #ifdef CONFIG_XEN
         ALIGN
@@ -635,120 +545,114 @@ restore_all_enable_events:
 	__ENABLE_INTERRUPTS
 .Lscrit: /**** START OF CRITICAL REGION ****/
 	__TEST_PENDING
-	jnz  14f			# process more events if necessary...
+	jnz	14f				# process more events if necessary...
 	RESTORE_REGS 4
 1:	INTERRUPT_RETURN
 	_ASM_EXTABLE(1b,iret_exc)
 14:	__DISABLE_INTERRUPTS
 .Lecrit: /**** END OF CRITICAL REGION ****/
 	TRACE_IRQS_OFF
-	jmp  .Ldo_upcall
+	jmp	.Ldo_upcall
 
-	CFI_RESTORE_STATE
 hypervisor_iret:
-	andl $~NMI_MASK, PT_EFLAGS(%esp)
+	andl	$~NMI_MASK, PT_EFLAGS(%esp)
 	RESTORE_REGS 4
-	jmp  hypercall_page + (__HYPERVISOR_iret * 32)
+	jmp	hypercall_page + (__HYPERVISOR_iret * 32)
 #endif
-	CFI_ENDPROC
-ENDPROC(system_call)
+ENDPROC(entry_INT80_32)
 
 	# perform work that needs to be done immediately before resumption
 	ALIGN
-	RING0_PTREGS_FRAME		# can't unwind into user space anyway
 work_pending:
-	testb $_TIF_NEED_RESCHED, %cl
-	jz work_notifysig
+	testb	$_TIF_NEED_RESCHED, %cl
+	jz	work_notifysig
 work_resched:
-	call schedule
+	call	schedule
 	LOCKDEP_SYS_EXIT
-	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
-					# setting need_resched or sigpending
-					# between sampling and the iret
+	DISABLE_INTERRUPTS(CLBR_ANY)		# make sure we don't miss an interrupt
+						# setting need_resched or sigpending
+						# between sampling and the iret
 	TRACE_IRQS_OFF
-	movl TI_flags(%ebp), %ecx
-	andl $_TIF_WORK_MASK, %ecx	# is there any work to be done other
-					# than syscall tracing?
-	jz restore_all
-	testb $_TIF_NEED_RESCHED, %cl
-	jnz work_resched
+	movl	TI_flags(%ebp), %ecx
+	andl	$_TIF_WORK_MASK, %ecx		# is there any work to be done other
+						# than syscall tracing?
+	jz	restore_all
+	testb	$_TIF_NEED_RESCHED, %cl
+	jnz	work_resched
 
-work_notifysig:				# deal with pending signals and
-					# notify-resume requests
+work_notifysig:					# deal with pending signals and
+						# notify-resume requests
 #ifdef CONFIG_VM86
-	testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
-	movl %esp, %eax
-	jnz work_notifysig_v86		# returning to kernel-space or
-					# vm86-space
+	testl	$X86_EFLAGS_VM, PT_EFLAGS(%esp)
+	movl	%esp, %eax
+	jnz	work_notifysig_v86		# returning to kernel-space or
+						# vm86-space
 1:
 #else
-	movl %esp, %eax
+	movl	%esp, %eax
 #endif
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	movb PT_CS(%esp), %bl
-	andb $SEGMENT_RPL_MASK, %bl
-	cmpb $USER_RPL, %bl
-	jb resume_kernel
-	xorl %edx, %edx
-	call do_notify_resume
-	jmp resume_userspace
+	movb	PT_CS(%esp), %bl
+	andb	$SEGMENT_RPL_MASK, %bl
+	cmpb	$USER_RPL, %bl
+	jb	resume_kernel
+	xorl	%edx, %edx
+	call	do_notify_resume
+	jmp	resume_userspace
 
 #ifdef CONFIG_VM86
 	ALIGN
 work_notifysig_v86:
-	pushl_cfi %ecx			# save ti_flags for do_notify_resume
-	call save_v86_state		# %eax contains pt_regs pointer
-	popl_cfi %ecx
-	movl %eax, %esp
-	jmp 1b
+	pushl	%ecx				# save ti_flags for do_notify_resume
+	call	save_v86_state			# %eax contains pt_regs pointer
+	popl	%ecx
+	movl	%eax, %esp
+	jmp	1b
 #endif
 END(work_pending)
 
 	# perform syscall exit tracing
 	ALIGN
 syscall_trace_entry:
-	movl $-ENOSYS,PT_EAX(%esp)
-	movl %esp, %eax
-	call syscall_trace_enter
+	movl	$-ENOSYS, PT_EAX(%esp)
+	movl	%esp, %eax
+	call	syscall_trace_enter
 	/* What it returned is what we'll actually use.  */
-	cmpl $(NR_syscalls), %eax
-	jnae syscall_call
-	jmp syscall_exit
+	cmpl	$(NR_syscalls), %eax
+	jnae	syscall_call
+	jmp	syscall_exit
 END(syscall_trace_entry)
 
 	# perform syscall exit tracing
 	ALIGN
 syscall_exit_work:
-	testl $_TIF_WORK_SYSCALL_EXIT, %ecx
-	jz work_pending
+	testl	$_TIF_WORK_SYSCALL_EXIT, %ecx
+	jz	work_pending
 	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_ANY)	# could let syscall_trace_leave() call
-					# schedule() instead
-	movl %esp, %eax
-	call syscall_trace_leave
-	jmp resume_userspace
+	ENABLE_INTERRUPTS(CLBR_ANY)		# could let syscall_trace_leave() call
+						# schedule() instead
+	movl	%esp, %eax
+	call	syscall_trace_leave
+	jmp	resume_userspace
 END(syscall_exit_work)
-	CFI_ENDPROC
 
-	RING0_INT_FRAME			# can't unwind into user space anyway
 syscall_fault:
 	ASM_CLAC
 	GET_THREAD_INFO(%ebp)
-	movl $-EFAULT,PT_EAX(%esp)
-	jmp resume_userspace
+	movl	$-EFAULT, PT_EAX(%esp)
+	jmp	resume_userspace
 END(syscall_fault)
 
 syscall_badsys:
-	movl $-ENOSYS,%eax
-	jmp syscall_after_call
+	movl	$-ENOSYS, %eax
+	jmp	syscall_after_call
 END(syscall_badsys)
 
 sysenter_badsys:
-	movl $-ENOSYS,%eax
-	jmp sysenter_after_call
+	movl	$-ENOSYS, %eax
+	jmp	sysenter_after_call
 END(sysenter_badsys)
-	CFI_ENDPROC
 
 #ifndef CONFIG_XEN
 .macro FIXUP_ESPFIX_STACK
@@ -761,25 +665,24 @@ END(sysenter_badsys)
  */
 #ifdef CONFIG_X86_ESPFIX32
 	/* fixup the stack */
-	mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
-	mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
-	shl $16, %eax
-	addl %esp, %eax			/* the adjusted stack pointer */
-	pushl_cfi $__KERNEL_DS
-	pushl_cfi %eax
-	lss (%esp), %esp		/* switch to the normal stack segment */
-	CFI_ADJUST_CFA_OFFSET -8
+	mov	GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
+	mov	GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
+	shl	$16, %eax
+	addl	%esp, %eax			/* the adjusted stack pointer */
+	pushl	$__KERNEL_DS
+	pushl	%eax
+	lss	(%esp), %esp			/* switch to the normal stack segment */
 #endif
 .endm
 .macro UNWIND_ESPFIX_STACK
 #ifdef CONFIG_X86_ESPFIX32
-	movl %ss, %eax
+	movl	%ss, %eax
 	/* see if on espfix stack */
-	cmpw $__ESPFIX_SS, %ax
-	jne 27f
-	movl $__KERNEL_DS, %eax
-	movl %eax, %ds
-	movl %eax, %es
+	cmpw	$__ESPFIX_SS, %ax
+	jne	27f
+	movl	$__KERNEL_DS, %eax
+	movl	%eax, %ds
+	movl	%eax, %es
 	/* switch to normal stack */
 	FIXUP_ESPFIX_STACK
 27:
@@ -792,13 +695,11 @@ END(sysenter_badsys)
  */
 	.align 8
 ENTRY(irq_entries_start)
-	RING0_INT_FRAME
     vector=FIRST_EXTERNAL_VECTOR
     .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
-	pushl_cfi $(~vector+0x80)	/* Note: always in signed byte range */
+	pushl	$(~vector+0x80)			/* Note: always in signed byte range */
     vector=vector+1
 	jmp	common_interrupt
-	CFI_ADJUST_CFA_OFFSET -4
 	.align	8
     .endr
 END(irq_entries_start)
@@ -810,38 +711,34 @@ END(irq_entries_start)
 	.p2align CONFIG_X86_L1_CACHE_SHIFT
 common_interrupt:
 	ASM_CLAC
-	addl $-0x80,(%esp)	/* Adjust vector into the [-256,-1] range */
+	addl	$-0x80, (%esp)			/* Adjust vector into the [-256, -1] range */
 	SAVE_ALL
 	TRACE_IRQS_OFF
-	movl %esp,%eax
-	call do_IRQ
-	jmp ret_from_intr
+	movl	%esp, %eax
+	call	do_IRQ
+	jmp	ret_from_intr
 ENDPROC(common_interrupt)
-	CFI_ENDPROC
 
 #define BUILD_INTERRUPT3(name, nr, fn)	\
 ENTRY(name)				\
-	RING0_INT_FRAME;		\
 	ASM_CLAC;			\
-	pushl_cfi $~(nr);		\
+	pushl	$~(nr);			\
 	SAVE_ALL;			\
 	TRACE_IRQS_OFF			\
-	movl %esp,%eax;			\
-	call fn;			\
-	jmp ret_from_intr;		\
-	CFI_ENDPROC;			\
+	movl	%esp, %eax;		\
+	call	fn;			\
+	jmp	ret_from_intr;		\
 ENDPROC(name)
 
 
 #ifdef CONFIG_TRACING
-#define TRACE_BUILD_INTERRUPT(name, nr)		\
-	BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name)
+# define TRACE_BUILD_INTERRUPT(name, nr)	BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name)
 #else
-#define TRACE_BUILD_INTERRUPT(name, nr)
+# define TRACE_BUILD_INTERRUPT(name, nr)
 #endif
 
-#define BUILD_INTERRUPT(name, nr) \
-	BUILD_INTERRUPT3(name, nr, smp_##name); \
+#define BUILD_INTERRUPT(name, nr)		\
+	BUILD_INTERRUPT3(name, nr, smp_##name);	\
 	TRACE_BUILD_INTERRUPT(name, nr)
 
 /* The include is where all of the SMP etc. interrupts come from */
@@ -867,34 +764,31 @@ ENDPROC(name)
 # critical region we know that the entire frame is present and correct
 # so we can simply throw away the new one.
 ENTRY(hypervisor_callback)
-	RING0_INT_FRAME
 	ASM_CLAC
-	pushl_cfi $-1
+	pushl	$-1
 	SAVE_ALL
-	movl PT_CS(%esp),%ecx
-	movl PT_EIP(%esp),%eax
-	andl $SEGMENT_RPL_MASK,%ecx
-	cmpl $USER_RPL,%ecx
-	jae  .Ldo_upcall
-	cmpl $.Lscrit,%eax
-	jb   0f
-	cmpl $.Lecrit,%eax
-	jb   critical_region_fixup
+	movl	PT_CS(%esp), %ecx
+	movl	PT_EIP(%esp), %eax
+	andl	$SEGMENT_RPL_MASK, %ecx
+	cmpl	$USER_RPL, %ecx
+	jae	.Ldo_upcall
+	cmpl	$.Lscrit, %eax
+	jb	0f
+	cmpl	$.Lecrit, %eax
+	jb	critical_region_fixup
 0:
 #ifdef CONFIG_XEN_SUPERVISOR_MODE_KERNEL
-	cmpl $.Lsysexit_scrit,%eax
-	jb   .Ldo_upcall
-	cmpl $.Lsysexit_ecrit,%eax
-	ja   .Ldo_upcall
-	addl $PT_OLDESP,%esp		# Remove eflags...ebx from stack frame.
+	cmpl	$.Lsysexit_scrit, %eax
+	jb	.Ldo_upcall
+	cmpl	$.Lsysexit_ecrit, %eax
+	ja	.Ldo_upcall
+	addl	$PT_OLDESP, %esp		# Remove eflags...ebx from stack frame.
 #endif
 .Ldo_upcall:
-	pushl_cfi %esp
-	call evtchn_do_upcall
-	add  $4,%esp
-	CFI_ADJUST_CFA_OFFSET -4
-	jmp  ret_from_intr
-	CFI_ENDPROC
+	pushl	%esp
+	call	evtchn_do_upcall
+	add	$4, %esp
+	jmp	ret_from_intr
 
 # [How we do the fixup]. We want to merge the current stack frame with the
 # just-interrupted frame. How we do this depends on where in the critical
@@ -904,18 +798,18 @@ ENTRY(hypervisor_callback)
 # provides the number of bytes which have already been popped from the
 # interrupted stack frame.
 critical_region_fixup:
-	movsbl critical_fixup_table-.Lscrit(%eax),%ecx # %ecx contains num slots popped
-	testl %ecx,%ecx
-	leal (%esp,%ecx,4),%esi		# %esi points at end of src region
-	leal PT_OLDESP(%esp),%edi	# %edi points at end of dst region
-	jle   17f			# skip loop if nothing to copy
-16:	subl $4,%esi			# pre-decrementing copy loop
-	subl $4,%edi
-	movl (%esi),%eax
-	movl %eax,(%edi)
-	loop 16b
-17:	movl %edi,%esp			# final %edi is top of merged stack
-	jmp  .Ldo_upcall
+	movsbl	critical_fixup_table-.Lscrit(%eax),%ecx # %ecx contains num slots popped
+	testl	%ecx, %ecx
+	leal	(%esp,%ecx,4), %esi		# %esi points at end of src region
+	leal	PT_OLDESP(%esp), %edi		# %edi points at end of dst region
+	jle	17f				# skip loop if nothing to copy
+16:	subl	$4, %esi			# pre-decrementing copy loop
+	subl	$4, %edi
+	movl	(%esi), %eax
+	movl	%eax, (%edi)
+	loop	16b
+17:	movl	%edi, %esp			# final %edi is top of merged stack
+	jmp	.Ldo_upcall
 
 .section .rodata,"a"
 critical_fixup_table:
@@ -953,73 +847,65 @@ critical_fixup_table:
 # We distinguish between categories by maintaining a status value in EAX.
 ENTRY(failsafe_callback)
 	ASM_CLAC
-	pushl %eax
-	movl $1,%eax
-1:	mov 4(%esp),%ds
-2:	mov 8(%esp),%es
-3:	mov 12(%esp),%fs
-4:	mov 16(%esp),%gs
-	testl %eax,%eax
-	popl %eax
-	leal 16(%esp),%esp
-	RING0_INT_FRAME
-	jnz iret_exc		# EAX != 0 => Category 2 (Bad IRET)
-	pushl_cfi $-1		# EAX == 0 => Category 1 (Bad segment)
+	pushl	%eax
+	movl	$1, %eax
+1:	mov	4(%esp), %ds
+2:	mov	8(%esp), %es
+3:	mov	12(%esp), %fs
+4:	mov	16(%esp), %gs
+	testl	%eax, %eax
+	popl	%eax
+	leal	16(%esp), %esp
+	jnz	iret_exc		# EAX != 0 => Category 2 (Bad IRET)
+	pushl	$-1			# EAX == 0 => Category 1 (Bad segment)
 	SAVE_ALL
-	jmp ret_from_exception
-.section .fixup,"ax";		\
-6:	xorl %eax,%eax;		\
-	movl %eax,4(%esp);	\
-	jmp 1b;			\
-7:	xorl %eax,%eax;		\
-	movl %eax,8(%esp);	\
-	jmp 2b;			\
-8:	xorl %eax,%eax;		\
-	movl %eax,12(%esp);	\
-	jmp 3b;			\
-9:	xorl %eax,%eax;		\
-	movl %eax,16(%esp);	\
-	jmp 4b;			\
+	jmp	ret_from_exception
+.section .fixup,"ax"
+6:	xorl	%eax, %eax
+	movl	%eax, 4(%esp)
+	jmp	1b
+7:	xorl	%eax, %eax
+	movl	%eax, 8(%esp)
+	jmp	2b
+8:	xorl	%eax, %eax
+	movl	%eax, 12(%esp)
+	jmp	3b
+9:	xorl	%eax, %eax
+	movl	%eax, 16(%esp)
+	jmp	4b
 .previous
 	_ASM_EXTABLE(1b,6b)
 	_ASM_EXTABLE(2b,7b)
 	_ASM_EXTABLE(3b,8b)
 	_ASM_EXTABLE(4b,9b)
 #endif
-	CFI_ENDPROC
 
 ENTRY(coprocessor_error)
-	RING0_INT_FRAME
 	ASM_CLAC
-	pushl_cfi $0
-	pushl_cfi $do_coprocessor_error
-	jmp error_code
-	CFI_ENDPROC
+	pushl	$0
+	pushl	$do_coprocessor_error
+	jmp	error_code
 END(coprocessor_error)
 
 ENTRY(simd_coprocessor_error)
-	RING0_INT_FRAME
 	ASM_CLAC
-	pushl_cfi $0
+	pushl	$0
 #ifdef CONFIG_X86_INVD_BUG
 	/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
-	ALTERNATIVE "pushl_cfi $do_general_protection",	\
-		    "pushl $do_simd_coprocessor_error", \
+	ALTERNATIVE "pushl	$do_general_protection",	\
+		    "pushl	$do_simd_coprocessor_error",	\
 		    X86_FEATURE_XMM
 #else
-	pushl_cfi $do_simd_coprocessor_error
+	pushl	$do_simd_coprocessor_error
 #endif
-	jmp error_code
-	CFI_ENDPROC
+	jmp	error_code
 END(simd_coprocessor_error)
 
 ENTRY(device_not_available)
-	RING0_INT_FRAME
 	ASM_CLAC
-	pushl_cfi $-1			# mark this as an int
-	pushl_cfi $do_device_not_available
-	jmp error_code
-	CFI_ENDPROC
+	pushl	$-1				# mark this as an int
+	pushl	$do_device_not_available
+	jmp	error_code
 END(device_not_available)
 
 #ifdef CONFIG_PARAVIRT
@@ -1035,109 +921,85 @@ END(native_irq_enable_sysexit)
 #endif
 
 ENTRY(overflow)
-	RING0_INT_FRAME
 	ASM_CLAC
-	pushl_cfi $0
-	pushl_cfi $do_overflow
-	jmp error_code
-	CFI_ENDPROC
+	pushl	$0
+	pushl	$do_overflow
+	jmp	error_code
 END(overflow)
 
 ENTRY(bounds)
-	RING0_INT_FRAME
 	ASM_CLAC
-	pushl_cfi $0
-	pushl_cfi $do_bounds
-	jmp error_code
-	CFI_ENDPROC
+	pushl	$0
+	pushl	$do_bounds
+	jmp	error_code
 END(bounds)
 
 ENTRY(invalid_op)
-	RING0_INT_FRAME
 	ASM_CLAC
-	pushl_cfi $0
-	pushl_cfi $do_invalid_op
-	jmp error_code
-	CFI_ENDPROC
+	pushl	$0
+	pushl	$do_invalid_op
+	jmp	error_code
 END(invalid_op)
 
 ENTRY(coprocessor_segment_overrun)
-	RING0_INT_FRAME
 	ASM_CLAC
-	pushl_cfi $0
-	pushl_cfi $do_coprocessor_segment_overrun
-	jmp error_code
-	CFI_ENDPROC
+	pushl	$0
+	pushl	$do_coprocessor_segment_overrun
+	jmp	error_code
 END(coprocessor_segment_overrun)
 
 ENTRY(invalid_TSS)
-	RING0_EC_FRAME
 	ASM_CLAC
-	pushl_cfi $do_invalid_TSS
-	jmp error_code
-	CFI_ENDPROC
+	pushl	$do_invalid_TSS
+	jmp	error_code
 END(invalid_TSS)
 
 ENTRY(segment_not_present)
-	RING0_EC_FRAME
 	ASM_CLAC
-	pushl_cfi $do_segment_not_present
-	jmp error_code
-	CFI_ENDPROC
+	pushl	$do_segment_not_present
+	jmp	error_code
 END(segment_not_present)
 
 ENTRY(stack_segment)
-	RING0_EC_FRAME
 	ASM_CLAC
-	pushl_cfi $do_stack_segment
-	jmp error_code
-	CFI_ENDPROC
+	pushl	$do_stack_segment
+	jmp	error_code
 END(stack_segment)
 
 ENTRY(alignment_check)
-	RING0_EC_FRAME
 	ASM_CLAC
-	pushl_cfi $do_alignment_check
-	jmp error_code
-	CFI_ENDPROC
+	pushl	$do_alignment_check
+	jmp	error_code
 END(alignment_check)
 
 ENTRY(divide_error)
-	RING0_INT_FRAME
 	ASM_CLAC
-	pushl_cfi $0			# no error code
-	pushl_cfi $do_divide_error
-	jmp error_code
-	CFI_ENDPROC
+	pushl	$0				# no error code
+	pushl	$do_divide_error
+	jmp	error_code
 END(divide_error)
 
 #ifdef CONFIG_X86_MCE
 ENTRY(machine_check)
-	RING0_INT_FRAME
 	ASM_CLAC
-	pushl_cfi $0
-	pushl_cfi machine_check_vector
-	jmp error_code
-	CFI_ENDPROC
+	pushl	$0
+	pushl	machine_check_vector
+	jmp	error_code
 END(machine_check)
 #endif
 
 #ifndef CONFIG_XEN
 ENTRY(spurious_interrupt_bug)
-	RING0_INT_FRAME
 	ASM_CLAC
-	pushl_cfi $0
-	pushl_cfi $do_spurious_interrupt_bug
-	jmp error_code
-	CFI_ENDPROC
+	pushl	$0
+	pushl	$do_spurious_interrupt_bug
+	jmp	error_code
 END(spurious_interrupt_bug)
 #endif /* !CONFIG_XEN */
 
 ENTRY(fixup_4gb_segment)
-	RING0_EC_FRAME
-	pushl_cfi $do_fixup_4gb_segment
+	pushl	$do_fixup_4gb_segment
 	jmp error_code
-	CFI_ENDPROC
 END(fixup_4gb_segment)
 
 #ifdef CONFIG_FUNCTION_TRACER
@@ -1148,28 +1010,28 @@ ENTRY(mcount)
 END(mcount)
 
 ENTRY(ftrace_caller)
-	pushl %eax
-	pushl %ecx
-	pushl %edx
-	pushl $0	/* Pass NULL as regs pointer */
-	movl 4*4(%esp), %eax
-	movl 0x4(%ebp), %edx
-	movl function_trace_op, %ecx
-	subl $MCOUNT_INSN_SIZE, %eax
+	pushl	%eax
+	pushl	%ecx
+	pushl	%edx
+	pushl	$0				/* Pass NULL as regs pointer */
+	movl	4*4(%esp), %eax
+	movl	0x4(%ebp), %edx
+	movl	function_trace_op, %ecx
+	subl	$MCOUNT_INSN_SIZE, %eax
 
 .globl ftrace_call
 ftrace_call:
-	call ftrace_stub
+	call	ftrace_stub
 
-	addl $4,%esp	/* skip NULL pointer */
-	popl %edx
-	popl %ecx
-	popl %eax
+	addl	$4, %esp			/* skip NULL pointer */
+	popl	%edx
+	popl	%ecx
+	popl	%eax
 ftrace_ret:
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 .globl ftrace_graph_call
 ftrace_graph_call:
-	jmp ftrace_stub
+	jmp	ftrace_stub
 #endif
 
 .globl ftrace_stub
@@ -1187,72 +1049,72 @@ ENTRY(ftrace_regs_caller)
 	 * as the current return ip is. We move the return ip into the
 	 * ip location, and move flags into the return ip location.
 	 */
-	pushl 4(%esp)	/* save return ip into ip slot */
+	pushl	4(%esp)				/* save return ip into ip slot */
 
-	pushl $0	/* Load 0 into orig_ax */
-	pushl %gs
-	pushl %fs
-	pushl %es
-	pushl %ds
-	pushl %eax
-	pushl %ebp
-	pushl %edi
-	pushl %esi
-	pushl %edx
-	pushl %ecx
-	pushl %ebx
-
-	movl 13*4(%esp), %eax	/* Get the saved flags */
-	movl %eax, 14*4(%esp)	/* Move saved flags into regs->flags location */
-				/* clobbering return ip */
-	movl $__KERNEL_CS,13*4(%esp)
-
-	movl 12*4(%esp), %eax	/* Load ip (1st parameter) */
-	subl $MCOUNT_INSN_SIZE, %eax	/* Adjust ip */
-	movl 0x4(%ebp), %edx	/* Load parent ip (2nd parameter) */
-	movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
-	pushl %esp		/* Save pt_regs as 4th parameter */
+	pushl	$0				/* Load 0 into orig_ax */
+	pushl	%gs
+	pushl	%fs
+	pushl	%es
+	pushl	%ds
+	pushl	%eax
+	pushl	%ebp
+	pushl	%edi
+	pushl	%esi
+	pushl	%edx
+	pushl	%ecx
+	pushl	%ebx
+
+	movl	13*4(%esp), %eax		/* Get the saved flags */
+	movl	%eax, 14*4(%esp)		/* Move saved flags into regs->flags location */
+						/* clobbering return ip */
+	movl	$__KERNEL_CS, 13*4(%esp)
+
+	movl	12*4(%esp), %eax		/* Load ip (1st parameter) */
+	subl	$MCOUNT_INSN_SIZE, %eax		/* Adjust ip */
+	movl	0x4(%ebp), %edx			/* Load parent ip (2nd parameter) */
+	movl	function_trace_op, %ecx		/* Save ftrace_pos in 3rd parameter */
+	pushl	%esp				/* Save pt_regs as 4th parameter */
 
 GLOBAL(ftrace_regs_call)
-	call ftrace_stub
+	call	ftrace_stub
 
-	addl $4, %esp		/* Skip pt_regs */
-	movl 14*4(%esp), %eax	/* Move flags back into cs */
-	movl %eax, 13*4(%esp)	/* Needed to keep addl from modifying flags */
-	movl 12*4(%esp), %eax	/* Get return ip from regs->ip */
-	movl %eax, 14*4(%esp)	/* Put return ip back for ret */
-
-	popl %ebx
-	popl %ecx
-	popl %edx
-	popl %esi
-	popl %edi
-	popl %ebp
-	popl %eax
-	popl %ds
-	popl %es
-	popl %fs
-	popl %gs
-	addl $8, %esp		/* Skip orig_ax and ip */
-	popf			/* Pop flags at end (no addl to corrupt flags) */
-	jmp ftrace_ret
+	addl	$4, %esp			/* Skip pt_regs */
+	movl	14*4(%esp), %eax		/* Move flags back into cs */
+	movl	%eax, 13*4(%esp)		/* Needed to keep addl	from modifying flags */
+	movl	12*4(%esp), %eax		/* Get return ip from regs->ip */
+	movl	%eax, 14*4(%esp)		/* Put return ip back for ret */
+
+	popl	%ebx
+	popl	%ecx
+	popl	%edx
+	popl	%esi
+	popl	%edi
+	popl	%ebp
+	popl	%eax
+	popl	%ds
+	popl	%es
+	popl	%fs
+	popl	%gs
+	addl	$8, %esp			/* Skip orig_ax and ip */
+	popf					/* Pop flags at end (no addl to corrupt flags) */
+	jmp	ftrace_ret
 
 	popf
-	jmp  ftrace_stub
+	jmp	ftrace_stub
 #else /* ! CONFIG_DYNAMIC_FTRACE */
 
 ENTRY(mcount)
-	cmpl $__PAGE_OFFSET, %esp
-	jb ftrace_stub		/* Paging not enabled yet? */
+	cmpl	$__PAGE_OFFSET, %esp
+	jb	ftrace_stub			/* Paging not enabled yet? */
 
-	cmpl $ftrace_stub, ftrace_trace_function
-	jnz trace
+	cmpl	$ftrace_stub, ftrace_trace_function
+	jnz	trace
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	cmpl $ftrace_stub, ftrace_graph_return
-	jnz ftrace_graph_caller
+	cmpl	$ftrace_stub, ftrace_graph_return
+	jnz	ftrace_graph_caller
 
-	cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
-	jnz ftrace_graph_caller
+	cmpl	$ftrace_graph_entry_stub, ftrace_graph_entry
+	jnz	ftrace_graph_caller
 #endif
 .globl ftrace_stub
 ftrace_stub:
@@ -1260,150 +1122,144 @@ ftrace_stub:
 
 	/* taken from glibc */
 trace:
-	pushl %eax
-	pushl %ecx
-	pushl %edx
-	movl 0xc(%esp), %eax
-	movl 0x4(%ebp), %edx
-	subl $MCOUNT_INSN_SIZE, %eax
-
-	call *ftrace_trace_function
-
-	popl %edx
-	popl %ecx
-	popl %eax
-	jmp ftrace_stub
+	pushl	%eax
+	pushl	%ecx
+	pushl	%edx
+	movl	0xc(%esp), %eax
+	movl	0x4(%ebp), %edx
+	subl	$MCOUNT_INSN_SIZE, %eax
+
+	call	*ftrace_trace_function
+
+	popl	%edx
+	popl	%ecx
+	popl	%eax
+	jmp	ftrace_stub
 END(mcount)
 #endif /* CONFIG_DYNAMIC_FTRACE */
 #endif /* CONFIG_FUNCTION_TRACER */
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 ENTRY(ftrace_graph_caller)
-	pushl %eax
-	pushl %ecx
-	pushl %edx
-	movl 0xc(%esp), %eax
-	lea 0x4(%ebp), %edx
-	movl (%ebp), %ecx
-	subl $MCOUNT_INSN_SIZE, %eax
-	call prepare_ftrace_return
-	popl %edx
-	popl %ecx
-	popl %eax
+	pushl	%eax
+	pushl	%ecx
+	pushl	%edx
+	movl	0xc(%esp), %eax
+	lea	0x4(%ebp), %edx
+	movl	(%ebp), %ecx
+	subl	$MCOUNT_INSN_SIZE, %eax
+	call	prepare_ftrace_return
+	popl	%edx
+	popl	%ecx
+	popl	%eax
 	ret
 END(ftrace_graph_caller)
 
 .globl return_to_handler
 return_to_handler:
-	pushl %eax
-	pushl %edx
-	movl %ebp, %eax
-	call ftrace_return_to_handler
-	movl %eax, %ecx
-	popl %edx
-	popl %eax
-	jmp *%ecx
+	pushl	%eax
+	pushl	%edx
+	movl	%ebp, %eax
+	call	ftrace_return_to_handler
+	movl	%eax, %ecx
+	popl	%edx
+	popl	%eax
+	jmp	*%ecx
 #endif
 
 #ifdef TIF_CSTAR
 	# pv syscall call handler stub
-ENTRY(ia32pv_cstar_target)
-	RING0_INT_FRAME
+ENTRY(entry_SYSCALL_PV32)
 	ASM_CLAC
-	movl $__USER_DS,16(%esp)
-	movl %ebp,%ecx
-	movl $__USER_CS,4(%esp)
-	movl 12(%esp),%ebp
-	pushl_cfi %eax			# save orig_eax
+	movl	$__USER_DS, 16(%esp)
+	movl	%ebp, %ecx
+	movl	$__USER_CS, 4(%esp)
+	movl	12(%esp), %ebp
+	pushl	%eax				# save orig_eax
 /*
  * Load the potential sixth argument from user stack.
  * Careful about security.
  */
-	cmpl $__PAGE_OFFSET-4,%ebp
-	CFI_REMEMBER_STATE
-	ja cstar_fault
+	cmpl	$__PAGE_OFFSET-4, %ebp
+	ja	cstar_fault
 	ASM_STAC
-1:	movl (%ebp),%ebp
+1:	movl	(%ebp), %ebp
 	ASM_CLAC
 	_ASM_EXTABLE(1b,cstar_fault)
 	SAVE_ALL
 	GET_THREAD_INFO(%ebp)
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
-	jnz cstar_trace_entry
-	cmpl $NR_syscalls,%eax
-	jae cstar_badsys
+	testl	$_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp)
+	jnz	cstar_trace_entry
+	cmpl	$NR_syscalls, %eax
+	jae	cstar_badsys
 .Lcstar_call:
-	btl %eax,cstar_special
-	jc .Lcstar_special
-	call *cstar_call_table(,%eax,4)
+	btl	%eax, cstar_special
+	jc	.Lcstar_special
+	call	*cstar_call_table(,%eax,4)
 .Lcstar_after_call:
-	movl %eax,PT_EAX(%esp)		# store the return value
+	movl	%eax, PT_EAX(%esp)		# store the return value
 .Lcstar_exit:
-	movl PT_ECX(%esp),%ecx
-	movl %ecx,PT_EBP(%esp)		# put user EBP back in place
-	jmp syscall_exit
+	movl	PT_ECX(%esp), %ecx
+	movl	%ecx, PT_EBP(%esp)		# put user EBP back in place
+	jmp	syscall_exit
 .Lcstar_special:
-	movl PT_ECX(%esp),%ecx
-	movl %ecx,PT_EBP(%esp)		# put user EBP back in place
-	jmp syscall_call
+	movl	PT_ECX(%esp), %ecx
+	movl	%ecx,PT_EBP(%esp)		# put user EBP back in place
+	jmp	syscall_call
 GLOBAL(cstar_set_tif)
-	movl $cstar_clear_tif,(%esp)	# replace return address
+	movl	$cstar_clear_tif, (%esp)	# replace return address
 	LOCK_PREFIX
-	orl $_TIF_CSTAR,TI_flags(%ebp)
-	jmp *sys_call_table(,%eax,4)
+	orl	$_TIF_CSTAR, TI_flags(%ebp)
+	jmp	*sys_call_table(,%eax,4)
 cstar_clear_tif:
 	LOCK_PREFIX
-	andl $~_TIF_CSTAR,TI_flags(%ebp)
-	jmp .Lcstar_after_call
+	andl	$~_TIF_CSTAR, TI_flags(%ebp)
+	jmp	.Lcstar_after_call
 cstar_trace_entry:
-	movl $-ENOSYS,PT_EAX(%esp)
-	cmpl $NR_syscalls,%eax
-	jae 1f
-	btl %eax,cstar_special
-	jc .Lcstar_trace_special
-1:	movl %esp,%eax
+	movl	$-ENOSYS, PT_EAX(%esp)
+	cmpl	$NR_syscalls, %eax
+	jae	1f
+	btl	%eax, cstar_special
+	jc	.Lcstar_trace_special
+1:	movl	%esp, %eax
 	LOCK_PREFIX
-	orl $_TIF_CSTAR,TI_flags(%ebp)
-	call syscall_trace_enter
+	orl	$_TIF_CSTAR, TI_flags(%ebp)
+	call	syscall_trace_enter
 	LOCK_PREFIX
-	andl $~_TIF_CSTAR,TI_flags(%ebp)
+	andl	$~_TIF_CSTAR, TI_flags(%ebp)
 	/* What it returned is what we'll actually use.  */
-	cmpl $NR_syscalls,%eax
-	jb .Lcstar_call
-	jmp .Lcstar_exit
+	cmpl	$NR_syscalls, %eax
+	jb	.Lcstar_call
+	jmp	.Lcstar_exit
 .Lcstar_trace_special:
-	movl PT_ECX(%esp),%ecx
-	movl %esp,%eax
-	movl %ecx,PT_EBP(%esp)		# put user EBP back in place
-	call syscall_trace_enter
+	movl	PT_ECX(%esp), %ecx
+	movl	%esp, %eax
+	movl	%ecx, PT_EBP(%esp)		# put user EBP back in place
+	call	syscall_trace_enter
 	/* What it returned is what we'll actually use.  */
-	cmpl $NR_syscalls,%eax
-	jb syscall_call
-	jmp syscall_exit
+	cmpl	$NR_syscalls, %eax
+	jb	syscall_call
+	jmp	syscall_exit
 cstar_badsys:
-	movl $-ENOSYS,%eax
-	jmp .Lcstar_after_call
-	CFI_RESTORE_STATE
+	movl	$-ENOSYS, %eax
+	jmp	.Lcstar_after_call
 cstar_fault:
 	ASM_CLAC
-	movl $-EFAULT,%eax
+	movl	$-EFAULT, %eax
 	SAVE_ALL
 	GET_THREAD_INFO(%ebp)
-	movl PT_ECX(%esp),%ecx
-	movl %ecx,PT_EBP(%esp)		# put user EBP back in place
-	jmp resume_userspace
-	CFI_ENDPROC
-ENDPROC(ia32pv_cstar_target)
+	movl	PT_ECX(%esp), %ecx
+	movl	%ecx, PT_EBP(%esp)		# put user EBP back in place
+	jmp	resume_userspace
+ENDPROC(entry_SYSCALL_PV32)
 
 ENTRY(cstar_ret_from_fork)
-	CFI_STARTPROC
-	movl PT_ECX(%esp),%ecx
+	movl	PT_ECX(%esp), %ecx
 	GET_THREAD_INFO(%ebp)
-	movl %ecx,PT_EBP(%esp)		# put user EBP back in place
+	movl	%ecx, PT_EBP(%esp)		# put user EBP back in place
 	LOCK_PREFIX
-	andl $~_TIF_CSTAR,TI_flags(%ebp)
-	jmp ret_from_fork
-	CFI_ENDPROC
+	andl	$~_TIF_CSTAR, TI_flags(%ebp)
+	jmp	ret_from_fork
 END(cstar_ret_from_fork)
 
 #include <asm/unistd.h>
@@ -1429,52 +1285,45 @@ mask=0
 
 #ifdef CONFIG_TRACING
 ENTRY(trace_page_fault)
-	RING0_EC_FRAME
 	ASM_CLAC
-	pushl_cfi $trace_do_page_fault
-	jmp error_code
-	CFI_ENDPROC
+	pushl	$trace_do_page_fault
+	jmp	error_code
 END(trace_page_fault)
 #endif
 
 ENTRY(page_fault)
-	RING0_EC_FRAME
 	ASM_CLAC
-	pushl_cfi $do_page_fault
+	pushl	$do_page_fault
 	ALIGN
 error_code:
 	/* the function address is in %gs's slot on the stack */
-	pushl_cfi %fs
-	/*CFI_REL_OFFSET fs, 0*/
-	pushl_cfi %es
-	/*CFI_REL_OFFSET es, 0*/
-	pushl_cfi %ds
-	/*CFI_REL_OFFSET ds, 0*/
-	pushl_cfi_reg eax
-	pushl_cfi_reg ebp
-	pushl_cfi_reg edi
-	pushl_cfi_reg esi
-	pushl_cfi_reg edx
-	pushl_cfi_reg ecx
-	pushl_cfi_reg ebx
+	pushl	%fs
+	pushl	%es
+	pushl	%ds
+	pushl	%eax
+	pushl	%ebp
+	pushl	%edi
+	pushl	%esi
+	pushl	%edx
+	pushl	%ecx
+	pushl	%ebx
 	cld
-	movl $(__KERNEL_PERCPU), %ecx
-	movl %ecx, %fs
+	movl	$(__KERNEL_PERCPU), %ecx
+	movl	%ecx, %fs
 	UNWIND_ESPFIX_STACK
 	GS_TO_REG %ecx
-	movl PT_GS(%esp), %edi		# get the function address
-	movl PT_ORIG_EAX(%esp), %edx	# get the error code
-	movl $-1, PT_ORIG_EAX(%esp)	# no syscall to restart
+	movl	PT_GS(%esp), %edi		# get the function address
+	movl	PT_ORIG_EAX(%esp), %edx		# get the error code
+	movl	$-1, PT_ORIG_EAX(%esp)		# no syscall to restart
 	REG_TO_PTGS %ecx
 	SET_KERNEL_GS %ecx
-	movl $(__USER_DS), %ecx
-	movl %ecx, %ds
-	movl %ecx, %es
+	movl	$(__USER_DS), %ecx
+	movl	%ecx, %ds
+	movl	%ecx, %es
 	TRACE_IRQS_OFF
-	movl %esp,%eax			# pt_regs pointer
-	call *%edi
-	jmp ret_from_exception
-	CFI_ENDPROC
+	movl	%esp, %eax			# pt_regs pointer
+	call	*%edi
+	jmp	ret_from_exception
 END(page_fault)
 
 #ifndef CONFIG_XEN
@@ -1492,36 +1341,31 @@ END(page_fault)
  * the instruction that would have done it for sysenter.
  */
 .macro FIX_STACK offset ok label
-	cmpw $__KERNEL_CS, 4(%esp)
-	jne \ok
+	cmpw	$__KERNEL_CS, 4(%esp)
+	jne	\ok
 \label:
-	movl TSS_sysenter_sp0 + \offset(%esp), %esp
-	CFI_DEF_CFA esp, 0
-	CFI_UNDEFINED eip
-	pushfl_cfi
-	pushl_cfi $__KERNEL_CS
-	pushl_cfi $sysenter_past_esp
-	CFI_REL_OFFSET eip, 0
+	movl	TSS_sysenter_sp0 + \offset(%esp), %esp
+	pushfl
+	pushl	$__KERNEL_CS
+	pushl	$sysenter_past_esp
 .endm
 #endif /* CONFIG_XEN */
 
 ENTRY(debug)
-	RING0_INT_FRAME
 	ASM_CLAC
 #ifndef CONFIG_XEN
-	cmpl $ia32_sysenter_target,(%esp)
-	jne debug_stack_correct
+	cmpl	$entry_SYSENTER_32, (%esp)
+	jne	debug_stack_correct
 	FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
 debug_stack_correct:
 #endif /* !CONFIG_XEN */
-	pushl_cfi $-1			# mark this as an int
+	pushl	$-1				# mark this as an int
 	SAVE_ALL
 	TRACE_IRQS_OFF
-	xorl %edx,%edx			# error code 0
-	movl %esp,%eax			# pt_regs pointer
-	call do_debug
-	jmp ret_from_exception
-	CFI_ENDPROC
+	xorl	%edx, %edx			# error code 0
+	movl	%esp, %eax			# pt_regs pointer
+	call	do_debug
+	jmp	ret_from_exception
 END(debug)
 
 /*
@@ -1533,117 +1377,103 @@ END(debug)
  * fault happened on the sysenter path.
  */
 ENTRY(nmi)
-	RING0_INT_FRAME
 	ASM_CLAC
 #ifdef CONFIG_X86_ESPFIX32
-	pushl_cfi %eax
-	movl %ss, %eax
-	cmpw $__ESPFIX_SS, %ax
-	popl_cfi %eax
-	je nmi_espfix_stack
+	pushl	%eax
+	movl	%ss, %eax
+	cmpw	$__ESPFIX_SS, %ax
+	popl	%eax
+	je	nmi_espfix_stack
 #endif
 #ifndef CONFIG_XEN
-	cmpl $ia32_sysenter_target,(%esp)
-	je nmi_stack_fixup
-	pushl_cfi %eax
-	movl %esp,%eax
-	/* Do not access memory above the end of our stack page,
+	cmpl	$entry_SYSENTER_32, (%esp)
+	je	nmi_stack_fixup
+	pushl	%eax
+	movl	%esp, %eax
+	/*
+	 * Do not access memory above the end of our stack page,
 	 * it might not exist.
 	 */
-	andl $(THREAD_SIZE-1),%eax
-	cmpl $(THREAD_SIZE-20),%eax
-	popl_cfi %eax
-	jae nmi_stack_correct
-	cmpl $ia32_sysenter_target,12(%esp)
-	je nmi_debug_stack_check
+	andl	$(THREAD_SIZE-1), %eax
+	cmpl	$(THREAD_SIZE-20), %eax
+	popl	%eax
+	jae	nmi_stack_correct
+	cmpl	$entry_SYSENTER_32, 12(%esp)
+	je	nmi_debug_stack_check
 nmi_stack_correct:
-	/* We have a RING0_INT_FRAME here */
-	pushl_cfi %eax
+	pushl	%eax
 	SAVE_ALL
-	xorl %edx,%edx		# zero error code
-	movl %esp,%eax		# pt_regs pointer
-	call do_nmi
-	jmp restore_all_notrace
-	CFI_ENDPROC
+	xorl	%edx, %edx			# zero error code
+	movl	%esp, %eax			# pt_regs pointer
+	call	do_nmi
+	jmp	restore_all_notrace
 
 nmi_stack_fixup:
-	RING0_INT_FRAME
 	FIX_STACK 12, nmi_stack_correct, 1
-	jmp nmi_stack_correct
+	jmp	nmi_stack_correct
 
 nmi_debug_stack_check:
-	/* We have a RING0_INT_FRAME here */
-	cmpw $__KERNEL_CS,16(%esp)
-	jne nmi_stack_correct
-	cmpl $debug,(%esp)
-	jb nmi_stack_correct
-	cmpl $debug_esp_fix_insn,(%esp)
-	ja nmi_stack_correct
+	cmpw	$__KERNEL_CS, 16(%esp)
+	jne	nmi_stack_correct
+	cmpl	$debug, (%esp)
+	jb	nmi_stack_correct
+	cmpl	$debug_esp_fix_insn, (%esp)
+	ja	nmi_stack_correct
 	FIX_STACK 24, nmi_stack_correct, 1
-	jmp nmi_stack_correct
+	jmp	nmi_stack_correct
 
 #ifdef CONFIG_X86_ESPFIX32
 nmi_espfix_stack:
-	/* We have a RING0_INT_FRAME here.
-	 *
+	/*
 	 * create the pointer to lss back
 	 */
-	pushl_cfi %ss
-	pushl_cfi %esp
-	addl $4, (%esp)
+	pushl	%ss
+	pushl	%esp
+	addl	$4, (%esp)
 	/* copy the iret frame of 12 bytes */
 	.rept 3
-	pushl_cfi 16(%esp)
+	pushl	16(%esp)
 	.endr
-	pushl_cfi %eax
+	pushl	%eax
 	SAVE_ALL
-	FIXUP_ESPFIX_STACK		# %eax == %esp
-	xorl %edx,%edx			# zero error code
-	call do_nmi
+	FIXUP_ESPFIX_STACK			# %eax == %esp
+	xorl	%edx, %edx			# zero error code
+	call	do_nmi
 	RESTORE_REGS
-	lss 12+4(%esp), %esp		# back to espfix stack
-	CFI_ADJUST_CFA_OFFSET -24
-	jmp irq_return
+	lss	12+4(%esp), %esp		# back to espfix stack
+	jmp	irq_return
 #endif
 #else /* CONFIG_XEN */
- 	pushl_cfi %eax
+ 	pushl	%eax
 	SAVE_ALL
-	xorl %edx,%edx		# zero error code
-	movl %esp,%eax		# pt_regs pointer
-	call do_nmi
-	orl  $NMI_MASK, PT_EFLAGS(%esp)
-	jmp restore_all
+	xorl	%edx, %edx			# zero error code
+	movl	%esp, %eax			# pt_regs pointer
+	call	do_nmi
+	orl	$NMI_MASK, PT_EFLAGS(%esp)
+	jmp	restore_all
 #endif
-	CFI_ENDPROC
 END(nmi)
 
 ENTRY(int3)
-	RING0_INT_FRAME
 	ASM_CLAC
-	pushl_cfi $-1			# mark this as an int
+	pushl	$-1				# mark this as an int
 	SAVE_ALL
 	TRACE_IRQS_OFF
-	xorl %edx,%edx		# zero error code
-	movl %esp,%eax		# pt_regs pointer
-	call do_int3
-	jmp ret_from_exception
-	CFI_ENDPROC
+	xorl	%edx, %edx			# zero error code
+	movl	%esp, %eax			# pt_regs pointer
+	call	do_int3
+	jmp	ret_from_exception
 END(int3)
 
 ENTRY(general_protection)
-	RING0_EC_FRAME
-	pushl_cfi $do_general_protection
-	jmp error_code
-	CFI_ENDPROC
+	pushl	$do_general_protection
+	jmp	error_code
 END(general_protection)
 
 #ifdef CONFIG_KVM_GUEST
 ENTRY(async_page_fault)
-	RING0_EC_FRAME
 	ASM_CLAC
-	pushl_cfi $do_async_page_fault
-	jmp error_code
-	CFI_ENDPROC
+	pushl	$do_async_page_fault
+	jmp	error_code
 END(async_page_fault)
 #endif
-
--- a/arch/x86/entry/entry_64-xen.S
+++ b/arch/x86/entry/entry_64-xen.S
@@ -7,34 +7,25 @@
  *  Jun Nakajima <jun.nakajima@intel.com>
  *  Asit Mallick <asit.k.mallick@intel.com>
  *      Modified for Xen
- */
-
-/*
+ *
  * entry.S contains the system-call and fault low-level handling routines.
  *
  * Some of this is documented in Documentation/x86/entry_64.txt
  *
- * NOTE: This code handles signal-recognition, which happens every time
- * after an interrupt and after each system call.
- *
  * A note on terminology:
- * - iret frame: Architecture defined interrupt frame from SS to RIP
- * at the top of the kernel process stack.
+ * - iret frame:	Architecture defined interrupt frame from SS to RIP
+ *			at the top of the kernel process stack.
  *
  * Some macro usage:
- * - CFI macros are used to generate dwarf2 unwind information for better
- * backtraces. They don't change any code.
- * - ENTRY/END Define functions in the symbol table.
- * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
- * - idtentry - Define exception entry points.
+ * - ENTRY/END:		Define functions in the symbol table.
+ * - TRACE_IRQ_*:	Trace hardirq state for lock debugging.
+ * - idtentry:		Define exception entry points.
  */
-
 #include <linux/linkage.h>
 #include <asm/segment.h>
 #include <asm/cache.h>
 #include <asm/errno.h>
-#include <asm/dwarf2.h>
-#include <asm/calling.h>
+#include "calling.h"
 #include <asm/asm-offsets.h>
 #include <asm/msr.h>
 #include <asm/unistd.h>
@@ -54,18 +45,17 @@
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
-#define AUDIT_ARCH_X86_64	(EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
-#define __AUDIT_ARCH_64BIT 0x80000000
-#define __AUDIT_ARCH_LE	   0x40000000
-
-	.code64
-	.section .entry.text, "ax"
+#define AUDIT_ARCH_X86_64			(EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_64BIT			0x80000000
+#define __AUDIT_ARCH_LE				0x40000000
 
+.code64
+.section .entry.text, "ax"
 
 .macro TRACE_IRQS_IRETQ
 #ifdef CONFIG_TRACE_IRQFLAGS
-	bt   $9,EFLAGS(%rsp)	/* interrupts off? */
-	jnc  1f
+	bt	$9, EFLAGS(%rsp)		/* interrupts off? */
+	jnc	1f
 	TRACE_IRQS_ON
 1:
 #endif
@@ -85,135 +75,65 @@
 #if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
 
 .macro TRACE_IRQS_OFF_DEBUG
-	call debug_stack_set_zero
+	call	debug_stack_set_zero
 	TRACE_IRQS_OFF
-	call debug_stack_reset
+	call	debug_stack_reset
 .endm
 
 .macro TRACE_IRQS_ON_DEBUG
-	call debug_stack_set_zero
+	call	debug_stack_set_zero
 	TRACE_IRQS_ON
-	call debug_stack_reset
+	call	debug_stack_reset
 .endm
 
 .macro TRACE_IRQS_IRETQ_DEBUG
-	bt   $9,EFLAGS(%rsp)	/* interrupts off? */
-	jnc  1f
+	bt	$9, EFLAGS(%rsp)		/* interrupts off? */
+	jnc	1f
 	TRACE_IRQS_ON_DEBUG
 1:
 .endm
 
 #else
-# define TRACE_IRQS_OFF_DEBUG		TRACE_IRQS_OFF
-# define TRACE_IRQS_ON_DEBUG		TRACE_IRQS_ON
-# define TRACE_IRQS_IRETQ_DEBUG		TRACE_IRQS_IRETQ
+# define TRACE_IRQS_OFF_DEBUG			TRACE_IRQS_OFF
+# define TRACE_IRQS_ON_DEBUG			TRACE_IRQS_ON
+# define TRACE_IRQS_IRETQ_DEBUG			TRACE_IRQS_IRETQ
 #endif
 
 NMI_MASK = 0x80000000
-	
-/*
- * empty frame
- */
-	.macro EMPTY_FRAME start=1 offset=0
-	.if \start
-	CFI_STARTPROC simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA rsp,8+\offset
-	.else
-	CFI_DEF_CFA_OFFSET 8+\offset
-	.endif
-	.endm
-
-/*
- * initial frame state for syscall
- */
-	.macro BASIC_FRAME start=1 offset=0
-	EMPTY_FRAME \start, 5*8+\offset
-	/*CFI_REL_OFFSET ss, 4*8+\offset*/
-	CFI_REL_OFFSET rsp, 3*8+\offset
-	/*CFI_REL_OFFSET rflags, 2*8+\offset*/
-	/*CFI_REL_OFFSET cs, 1*8+\offset*/
-	CFI_REL_OFFSET rip, 0*8+\offset
-	.endm
-
-/*
- * initial frame state for interrupts (and exceptions without error code)
- */
-	.macro INTR_FRAME start=1 offset=0
-	.if \start == 1
-	BASIC_FRAME 1, \offset+2*8
-	CFI_REL_OFFSET rcx, 0+\offset
-	CFI_REL_OFFSET r11, 8+\offset
-	.else
-	BASIC_FRAME \start, \offset
-	.endif
-	.endm
-
-/*
- * initial frame state for exceptions with error code (and interrupts
- * with vector already pushed)
- */
-	.macro XCPT_FRAME start=1 offset=0
-	INTR_FRAME \start, 1*8+\offset
-	.endm
-
-/*
- * frame that enables passing a complete pt_regs to a C function.
- */
-	.macro DEFAULT_FRAME start=1 offset=0 extra=1
-	XCPT_FRAME -(\start), ORIG_RAX+\offset
-	CFI_REL_OFFSET rdi, RDI+\offset
-	CFI_REL_OFFSET rsi, RSI+\offset
-	CFI_REL_OFFSET rdx, RDX+\offset
-	CFI_REL_OFFSET rcx, RCX+\offset
-	CFI_REL_OFFSET rax, RAX+\offset
-	CFI_REL_OFFSET r8, R8+\offset
-	CFI_REL_OFFSET r9, R9+\offset
-	CFI_REL_OFFSET r10, R10+\offset
-	CFI_REL_OFFSET r11, R11+\offset
-	.if \extra
-	CFI_REL_OFFSET rbx, RBX+\offset
-	CFI_REL_OFFSET rbp, RBP+\offset
-	CFI_REL_OFFSET r12, R12+\offset
-	CFI_REL_OFFSET r13, R13+\offset
-	CFI_REL_OFFSET r14, R14+\offset
-	CFI_REL_OFFSET r15, R15+\offset
-	.endif
-	.endm
 
-        /*
-         * Must be consistent with the definition in arch-x86/xen-x86_64.h:
-         *     struct iret_context {
-         *        u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
-         *     };
-         * with rax, r11, and rcx being taken care of in the hypercall stub.
-         */
+	/*
+	 * Must be consistent with the definition in arch-x86/xen-x86_64.h:
+	 *     struct iret_context {
+	 *        u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
+	 *     };
+	 * with rax, r11, and rcx being taken care of in the hypercall stub.
+	 */
 	.macro HYPERVISOR_IRET flag
 	.if \flag == 0	# return from syscall always uses the hypercall
-	testb $3,1*8(%rsp)
-	jnz   2f
-	testl $NMI_MASK,2*8(%rsp)
-	jnz   2f
+	testb	$3, 1*8(%rsp)
+	jnz	2f
+	testl	$NMI_MASK, 2*8(%rsp)
+	jnz	2f
 
-	cmpb  $0,(xen_features+XENFEAT_supervisor_mode_kernel)(%rip)
-	jne   1f
+	cmpb	$0, xen_features+XENFEAT_supervisor_mode_kernel(%rip)
+	jne	1f
 
 	/* Direct iret to kernel space. Correct CS and SS. */
-	orl   $3,1*8(%rsp)
-	orl   $3,4*8(%rsp)
+	orl	$3, 1*8(%rsp)
+	orl	$3, 4*8(%rsp)
 1:	iretq
 	.endif
 
 2:	/* Slow iret via hypervisor. */
-	andl  $~NMI_MASK, 2*8(%rsp)
-	pushq $\flag & VGCF_in_syscall
-	jmp  hypercall_page + (__HYPERVISOR_iret * 32)
+	andl	$~NMI_MASK, 2*8(%rsp)
+	pushq	$\flag & VGCF_in_syscall
+	jmp	hypercall_page + (__HYPERVISOR_iret * 32)
 	.endm
 
 /*
- * 64bit SYSCALL instruction entry. Up to 6 arguments in registers.
+ * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
  *
- * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
  * then loads new ss, cs, and rip from previously programmed MSRs.
  * rflags gets masked by a value from another MSR (so CLD and CLAC
  * are not needed). SYSCALL does not save anything on the stack
@@ -229,7 +149,7 @@ NMI_MASK = 0x80000000
  * r10  arg3 (needs to be moved to rcx to conform to C ABI)
  * r8   arg4
  * r9   arg5
- * (note: r12-r15,rbp,rbx are callee-preserved in C ABI)
+ * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
  *
  * Only called from user space.
  *
@@ -238,40 +158,38 @@ NMI_MASK = 0x80000000
  * with them due to bugs in both AMD and Intel CPUs.
  */
 
-ENTRY(system_call)
-	INTR_FRAME start=2 offset=2*8
+ENTRY(entry_SYSCALL_64)
 	/*
 	 * Interrupts are enabled on entry.
 	 */
 	/* Construct struct pt_regs on stack */
-	movl		$__USER_DS,6*8(%rsp)	/* pt_regs->ss */
-	movl		$__USER_CS,3*8(%rsp)	/* pt_regs->cs */
-	movq_cfi	rax,8			/* pt_regs->orig_ax */
-	movq_cfi	rdi,0			/* pt_regs->di */
-	pushq_cfi_reg	rsi			/* pt_regs->si */
-	pushq_cfi_reg	rdx			/* pt_regs->dx */
-	pushq_cfi_reg	rcx			/* pt_regs->cx */
-	pushq_cfi	$-ENOSYS		/* pt_regs->ax */
-	pushq_cfi_reg	r8			/* pt_regs->r8 */
-	pushq_cfi_reg	r9			/* pt_regs->r9 */
-	pushq_cfi_reg	r10			/* pt_regs->r10 */
-	pushq_cfi_reg	r11			/* pt_regs->r11 */
-	sub	$(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */
-	CFI_ADJUST_CFA_OFFSET 6*8
-
-	testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	jnz tracesys
-system_call_fastpath:
+	movl	$__USER_DS, 6*8(%rsp)		/* pt_regs->ss */
+	movl	$__USER_CS, 3*8(%rsp)		/* pt_regs->cs */
+	movq	%rax, 8(%rsp)			/* pt_regs->orig_ax */
+	movq	%rdi, 0(%rsp)			/* pt_regs->di */
+	pushq	%rsi				/* pt_regs->si */
+	pushq	%rdx				/* pt_regs->dx */
+	pushq	%rcx				/* pt_regs->cx */
+	pushq	$-ENOSYS			/* pt_regs->ax */
+	pushq	%r8				/* pt_regs->r8 */
+	pushq	%r9				/* pt_regs->r9 */
+	pushq	%r10				/* pt_regs->r10 */
+	pushq	%r11				/* pt_regs->r11 */
+	sub	$(6*8), %rsp			/* pt_regs->bp, bx, r12-15 not saved */
+
+	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	jnz	tracesys
+entry_SYSCALL_64_fastpath:
 #if __SYSCALL_MASK == ~0
-	cmpq $__NR_syscall_max,%rax
+	cmpq	$__NR_syscall_max, %rax
 #else
-	andl $__SYSCALL_MASK,%eax
-	cmpl $__NR_syscall_max,%eax
+	andl	$__SYSCALL_MASK, %eax
+	cmpl	$__NR_syscall_max, %eax
 #endif
-	ja	1f	/* return -ENOSYS (already in pt_regs->ax) */
-	movq %r10,%rcx
-	call *sys_call_table(,%rax,8)
-	movq %rax,RAX(%rsp)
+	ja	1f				/* return -ENOSYS (already in pt_regs->ax) */
+	movq	%r10, %rcx
+	call	*sys_call_table(, %rax, 8)
+	movq	%rax, RAX(%rsp)
 1:
 /*
  * Syscall return path ending with SYSRET (fast path).
@@ -292,36 +210,32 @@ system_call_fastpath:
 	 * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
 	 * very bad.
 	 */
-	testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	jnz int_ret_from_sys_call_irqs_off	/* Go to the slow path */
-
-	CFI_REMEMBER_STATE
+	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	jnz	int_ret_from_sys_call_irqs_off	/* Go to the slow path */
 
 	RESTORE_C_REGS_EXCEPT_RCX_R11
 	REMOVE_PT_GPREGS_FROM_STACK 8
-	xor %ecx,%ecx
-	xor %r11,%r11
-        HYPERVISOR_IRET VGCF_IN_SYSCALL
-
-	CFI_RESTORE_STATE
+	xor	%ecx, %ecx
+	xor	%r11, %r11
+	HYPERVISOR_IRET VGCF_IN_SYSCALL
 
 	/* Do syscall entry tracing */
 tracesys:
-	movq %rsp, %rdi
-	movl $AUDIT_ARCH_X86_64, %esi
-	call syscall_trace_enter_phase1
-	test %rax, %rax
-	jnz tracesys_phase2		/* if needed, run the slow path */
-	RESTORE_C_REGS_EXCEPT_RAX	/* else restore clobbered regs */
-	movq ORIG_RAX(%rsp), %rax
-	jmp system_call_fastpath	/*      and return to the fast path */
+	movq	%rsp, %rdi
+	movl	$AUDIT_ARCH_X86_64, %esi
+	call	syscall_trace_enter_phase1
+	test	%rax, %rax
+	jnz	tracesys_phase2			/* if needed, run the slow path */
+	RESTORE_C_REGS_EXCEPT_RAX		/* else restore clobbered regs */
+	movq	ORIG_RAX(%rsp), %rax
+	jmp	entry_SYSCALL_64_fastpath	/* and return to the fast path */
 
 tracesys_phase2:
 	SAVE_EXTRA_REGS
-	movq %rsp, %rdi
-	movl $AUDIT_ARCH_X86_64, %esi
-	movq %rax,%rdx
-	call syscall_trace_enter_phase2
+	movq	%rsp, %rdi
+	movl	$AUDIT_ARCH_X86_64, %esi
+	movq	%rax, %rdx
+	call	syscall_trace_enter_phase2
 
 	/*
 	 * Reload registers from stack in case ptrace changed them.
@@ -331,15 +245,15 @@ tracesys_phase2:
 	RESTORE_C_REGS_EXCEPT_RAX
 	RESTORE_EXTRA_REGS
 #if __SYSCALL_MASK == ~0
-	cmpq $__NR_syscall_max,%rax
+	cmpq	$__NR_syscall_max, %rax
 #else
-	andl $__SYSCALL_MASK,%eax
-	cmpl $__NR_syscall_max,%eax
+	andl	$__SYSCALL_MASK, %eax
+	cmpl	$__NR_syscall_max, %eax
 #endif
-	ja	1f	/* return -ENOSYS (already in pt_regs->ax) */
-	movq %r10,%rcx	/* fixup for C */
-	call *sys_call_table(,%rax,8)
-	movq %rax,RAX(%rsp)
+	ja	1f				/* return -ENOSYS (already in pt_regs->ax) */
+	movq	%r10, %rcx			/* fixup for C */
+	call	*sys_call_table(, %rax, 8)
+	movq	%rax, RAX(%rsp)
 1:
 	/* Use IRET because user could have changed pt_regs->foo */
 
@@ -351,31 +265,33 @@ GLOBAL(int_ret_from_sys_call)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */
 	TRACE_IRQS_OFF
-	movl $_TIF_ALLWORK_MASK,%edi
+	movl	$_TIF_ALLWORK_MASK, %edi
 	/* edi:	mask to check */
 GLOBAL(int_with_check)
 	LOCKDEP_SYS_EXIT_IRQ
 	GET_THREAD_INFO(%rcx)
-	movl TI_flags(%rcx),%edx
-	andl %edi,%edx
-	jnz   int_careful
-	andl	$~TS_COMPAT,TI_status(%rcx)
+	movl	TI_flags(%rcx), %edx
+	andl	%edi, %edx
+	jnz	int_careful
+	andl	$~TS_COMPAT, TI_status(%rcx)
 	jmp	restore_c_regs_and_iret
 
-	/* Either reschedule or signal or syscall exit tracking needed. */
-	/* First do a reschedule test. */
-	/* edx:	work, edi: workmask */
+	/*
+	 * Either reschedule or signal or syscall exit tracking needed.
+	 * First do a reschedule test.
+	 * edx:	work, edi: workmask
+	 */
 int_careful:
-	bt $TIF_NEED_RESCHED,%edx
-	jnc  int_very_careful
+	bt	$TIF_NEED_RESCHED, %edx
+	jnc	int_very_careful
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	pushq_cfi %rdi
+	pushq	%rdi
 	SCHEDULE_USER
-	popq_cfi %rdi
+	popq	%rdi
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	jmp int_with_check
+	jmp	int_with_check
 
 	/* handle signals and tracing -- both require a full pt_regs */
 int_very_careful:
@@ -383,38 +299,34 @@ int_very_careful:
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	SAVE_EXTRA_REGS
 	/* Check for syscall exit trace */
-	testl $_TIF_WORK_SYSCALL_EXIT,%edx
-	jz int_signal
-	pushq_cfi %rdi
-	leaq 8(%rsp),%rdi	# &ptregs -> arg1
-	call syscall_trace_leave
-	popq_cfi %rdi
-	andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
-	jmp int_restore_rest
+	testl	$_TIF_WORK_SYSCALL_EXIT, %edx
+	jz	int_signal
+	pushq	%rdi
+	leaq	8(%rsp), %rdi			/* &ptregs -> arg1 */
+	call	syscall_trace_leave
+	popq	%rdi
+	andl	$~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU), %edi
+	jmp	int_restore_rest
 
 int_signal:
-	testl $_TIF_DO_NOTIFY_MASK,%edx
-	jz 1f
-	movq %rsp,%rdi		# &ptregs -> arg1
-	xorl %esi,%esi		# oldset -> arg2
-	call do_notify_resume
-1:	movl $_TIF_WORK_MASK,%edi
+	testl	$_TIF_DO_NOTIFY_MASK, %edx
+	jz	1f
+	movq	%rsp, %rdi			/* &ptregs -> arg1 */
+	xorl	%esi, %esi			/* oldset -> arg2 */
+	call	do_notify_resume
+1:	movl	$_TIF_WORK_MASK, %edi
 int_restore_rest:
 	RESTORE_EXTRA_REGS
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	jmp int_with_check
-	CFI_ENDPROC
-END(system_call)
+	jmp	int_with_check
+END(entry_SYSCALL_64)
 
 
 	.macro FORK_LIKE func
 ENTRY(stub_\func)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8, 0		/* offset 8: return address */
 	SAVE_EXTRA_REGS 8
-	jmp sys_\func
-	CFI_ENDPROC
+	jmp	sys_\func
 END(stub_\func)
 	.endm
 
@@ -423,8 +335,6 @@ END(stub_\func)
 	FORK_LIKE  vfork
 
 ENTRY(stub_execve)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8, 0
 	call	sys_execve
 return_from_execve:
 	testl	%eax, %eax
@@ -434,11 +344,9 @@ return_from_execve:
 1:
 	/* must use IRET code path (pt_regs->cs may have changed) */
 	addq	$8, %rsp
-	CFI_ADJUST_CFA_OFFSET -8
 	ZERO_EXTRA_REGS
-	movq	%rax,RAX(%rsp)
+	movq	%rax, RAX(%rsp)
 	jmp	int_ret_from_sys_call
-	CFI_ENDPROC
 END(stub_execve)
 /*
  * Remaining execve stubs are only 7 bytes long.
@@ -446,47 +354,25 @@ END(stub_execve)
  */
 	.align	8
 GLOBAL(stub_execveat)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8, 0
 	call	sys_execveat
 	jmp	return_from_execve
-	CFI_ENDPROC
 END(stub_execveat)
 
-#ifdef CONFIG_X86_X32_ABI
+#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION)
 	.align	8
 GLOBAL(stub_x32_execve)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8, 0
-	call	compat_sys_execve
-	jmp	return_from_execve
-	CFI_ENDPROC
-END(stub_x32_execve)
-	.align	8
-GLOBAL(stub_x32_execveat)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8, 0
-	call	compat_sys_execveat
-	jmp	return_from_execve
-	CFI_ENDPROC
-END(stub_x32_execveat)
-#endif
-
-#ifdef CONFIG_IA32_EMULATION
-	.align	8
 GLOBAL(stub32_execve)
-	CFI_STARTPROC
 	call	compat_sys_execve
 	jmp	return_from_execve
-	CFI_ENDPROC
 END(stub32_execve)
+END(stub_x32_execve)
 	.align	8
+GLOBAL(stub_x32_execveat)
 GLOBAL(stub32_execveat)
-	CFI_STARTPROC
 	call	compat_sys_execveat
 	jmp	return_from_execve
-	CFI_ENDPROC
 END(stub32_execveat)
+END(stub_x32_execveat)
 #endif
 
 /*
@@ -494,8 +380,6 @@ END(stub32_execveat)
  * This cannot be done with SYSRET, so use the IRET return path instead.
  */
 ENTRY(stub_rt_sigreturn)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8, 0
 	/*
 	 * SAVE_EXTRA_REGS result is not normally needed:
 	 * sigreturn overwrites all pt_regs->GPREGS.
@@ -504,24 +388,19 @@ ENTRY(stub_rt_sigreturn)
 	 * we SAVE_EXTRA_REGS here.
 	 */
 	SAVE_EXTRA_REGS 8
-	call sys_rt_sigreturn
+	call	sys_rt_sigreturn
 return_from_stub:
 	addq	$8, %rsp
-	CFI_ADJUST_CFA_OFFSET -8
 	RESTORE_EXTRA_REGS
-	movq %rax,RAX(%rsp)
-	jmp int_ret_from_sys_call
-	CFI_ENDPROC
+	movq	%rax, RAX(%rsp)
+	jmp	int_ret_from_sys_call
 END(stub_rt_sigreturn)
 
 #ifdef CONFIG_X86_X32_ABI
 ENTRY(stub_x32_rt_sigreturn)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8, 0
 	SAVE_EXTRA_REGS 8
-	call sys32_x32_rt_sigreturn
-	jmp  return_from_stub
-	CFI_ENDPROC
+	call	sys32_x32_rt_sigreturn
+	jmp	return_from_stub
 END(stub_x32_rt_sigreturn)
 #endif
 
@@ -531,53 +410,55 @@ END(stub_x32_rt_sigreturn)
  * rdi: prev task we switched from
  */
 ENTRY(ret_from_fork)
-	DEFAULT_FRAME
 
-	LOCK ; btr $TIF_FORK,TI_flags(%r8)
+	LOCK ; btr $TIF_FORK, TI_flags(%r8)
 
-	pushq_cfi $0x0002
-	popfq_cfi				# reset kernel eflags
+	pushq	$0x0002
+	popfq					/* reset kernel eflags */
 
-	call schedule_tail			# rdi: 'prev' task parameter
+	call	schedule_tail			/* rdi: 'prev' task parameter */
 
 	RESTORE_EXTRA_REGS
 
-	testl $3,CS(%rsp)			# from kernel_thread?
+	testb	$3, CS(%rsp)			/* from kernel_thread? */
 
 	/*
 	 * By the time we get here, we have no idea whether our pt_regs,
 	 * ti flags, and ti status came from the 64-bit SYSCALL fast path,
-	 * the slow path, or one of the ia32entry paths.
+	 * the slow path, or one of the 32-bit compat paths.
 	 * Use IRET code path to return, since it can safely handle
 	 * all of the above.
 	 */
 	jnz	int_ret_from_sys_call
 
-	/* We came from kernel_thread */
-	/* Need to set the proper %ss (not NULL) for ring 3 iretq */
-	movl $__KERNEL_DS, SS(%rsp)
+	/*
+	 * We came from kernel_thread
+	 * Need to set the proper %ss (not NULL) for ring 3 iretq
+	 */
+	movl	$__KERNEL_DS, SS(%rsp)
 	/* nb: we depend on RESTORE_EXTRA_REGS above */
-	movq %rbp, %rdi
-	call *%rbx
-	movl $0, RAX(%rsp)
+	movq	%rbp, %rdi
+	call	*%rbx
+	movl	$0, RAX(%rsp)
 	RESTORE_EXTRA_REGS
-	jmp int_ret_from_sys_call
-	CFI_ENDPROC
+	jmp	int_ret_from_sys_call
 END(ret_from_fork)
 
 /*
  * Interrupt exit.
- */ 
+ */
+	/* Interrupt came from user space */
+retint_user:
+	GET_THREAD_INFO(%rcx)
 
+	/* %rcx: thread info. Interrupts are off. */
 retint_with_reschedule:
-	DEFAULT_FRAME extra=0
-	movl $_TIF_WORK_MASK,%edi
+	movl	$_TIF_WORK_MASK, %edi
 retint_check:
 	LOCKDEP_SYS_EXIT_IRQ
-	movl TI_flags(%rcx),%edx
-	andl %edi,%edx
-	CFI_REMEMBER_STATE
-	jnz  retint_careful
+	movl	TI_flags(%rcx), %edx
+	andl	%edi, %edx
+	jnz	retint_careful
 	jmp	restore_c_regs_and_iret
 
 /* Returning to kernel space */
@@ -585,9 +466,9 @@ retint_kernel:
 #ifdef CONFIG_PREEMPT
 	/* Interrupts are off */
 	/* Check if we need preemption */
-	bt	$9,EFLAGS(%rsp)	/* interrupts were off? */
+	bt	$9, EFLAGS(%rsp)		/* were interrupts off? */
 	jnc	1f
-0:	cmpl	$0,PER_CPU_VAR(__preempt_count)
+0:	cmpl	$0, PER_CPU_VAR(__preempt_count)
 	jnz	1f
 	call	preempt_schedule_irq
 	jmp	0b
@@ -599,12 +480,12 @@ retint_kernel:
  * which come from interrupts/exception and from syscalls, merge.
  */
 restore_c_regs_and_iret:
-	movl EFLAGS(%rsp), %eax
-	shr $9, %eax			# EAX[0] == IRET_EFLAGS.IF
+	movl	EFLAGS(%rsp), %eax
+	shr	$9, %eax			/* EAX[0] == IRET_EFLAGS.IF */
 	GET_VCPU_INFO
-	andb evtchn_upcall_mask(%rsi),%al
-	andb $1,%al			# EAX[0] == IRET_EFLAGS.IF & event_mask
-	jnz restore_all_enable_events	#        != 0 => enable event delivery
+	andb	evtchn_upcall_mask(%rsi), %al
+	andb	$1, %al				/* EAX[0] == IRET_EFLAGS.IF & event_mask */
+	jnz	restore_all_enable_events	/*        != 0 => enable event delivery */
 .Lrestore_c_regs_and_iret:
 	/*
 	 * The iretq could re-enable interrupts:
@@ -613,39 +494,37 @@ restore_c_regs_and_iret:
 	RESTORE_C_REGS
 	REMOVE_PT_GPREGS_FROM_STACK 8
 	HYPERVISOR_IRET 0
-	
+
 	/* edi: workmask, edx: work */
 retint_careful:
-	CFI_RESTORE_STATE
-	bt    $TIF_NEED_RESCHED,%edx
-	jnc   retint_signal
+	bt	$TIF_NEED_RESCHED, %edx
+	jnc	retint_signal
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	pushq_cfi %rdi
+	pushq	%rdi
 	SCHEDULE_USER
-	popq_cfi %rdi
+	popq	%rdi
 	GET_THREAD_INFO(%rcx)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	jmp retint_check
+	jmp	retint_check
 
 retint_signal:
-	testl $_TIF_DO_NOTIFY_MASK,%edx
-	jz    restore_c_regs_and_iret
+	testl	$_TIF_DO_NOTIFY_MASK, %edx
+	jz	restore_c_regs_and_iret
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	SAVE_EXTRA_REGS
-	movq $-1,ORIG_RAX(%rsp)
-	xorl %esi,%esi		# oldset
-	movq %rsp,%rdi		# &pt_regs
-	call do_notify_resume
+	movq	$-1, ORIG_RAX(%rsp)
+	xorl	%esi, %esi			/* oldset */
+	movq	%rsp, %rdi			/* &pt_regs */
+	call	do_notify_resume
 	RESTORE_EXTRA_REGS
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	GET_THREAD_INFO(%rcx)
-	jmp retint_with_reschedule
+	jmp	retint_with_reschedule
 
-	CFI_ENDPROC
 END(retint_check)
 
 /*
@@ -660,102 +539,89 @@ ENTRY(\sym)
 	.error "using shift_ist requires paranoid=1"
 	.endif
 
-	.if \has_error_code
-	XCPT_FRAME
-	.else
-	INTR_FRAME
-	.endif
-
 	ASM_CLAC
-	movq_cfi_restore 0,rcx
-	movq_cfi_restore 8,r11
+	movq	0(%rsp), %rcx
+	movq	8(%rsp), %r11
 
 	.if \has_error_code
 	ALLOC_PT_GPREGS_ON_STACK -2*8
 	.else
-	movq $-1,8(%rsp)	/* ORIG_RAX: no syscall to restart */
+	movq	$-1, 8(%rsp)			/* ORIG_RAX: no syscall to restart */
 	ALLOC_PT_GPREGS_ON_STACK -1*8
 	.endif
 
 	.if \paranoid
 	.if \paranoid == 1
-	CFI_REMEMBER_STATE
-	testl $3, CS(%rsp)		/* If coming from userspace, switch */
-	jnz 1f				/* stacks. */
+	testb	$3, CS(%rsp)			/* If coming from userspace, switch stacks */
+	jnz	1f
 	.endif
-	call paranoid_entry
+	call	paranoid_entry
 	.else
-	call error_entry
+	call	error_entry
 	.endif
 	/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
 
-	DEFAULT_FRAME 0
-
 	.if \paranoid
 	.if \shift_ist != -1
-	TRACE_IRQS_OFF_DEBUG		/* reload IDT in case of recursion */
+	TRACE_IRQS_OFF_DEBUG			/* reload IDT in case of recursion */
 	.else
 	TRACE_IRQS_OFF
 	.endif
 	.endif
 
-	movq %rsp,%rdi			/* pt_regs pointer */
+	movq	%rsp, %rdi			/* pt_regs pointer */
 
 	.if \has_error_code
-	movq ORIG_RAX(%rsp),%rsi	/* get error code */
-	movq $-1,ORIG_RAX(%rsp)		/* no syscall to restart */
+	movq	ORIG_RAX(%rsp), %rsi		/* get error code */
+	movq	$-1, ORIG_RAX(%rsp)		/* no syscall to restart */
 	.else
-	xorl %esi,%esi			/* no error code */
+	xorl	%esi, %esi			/* no error code */
 	.endif
 
 	.if \shift_ist != -1
-	subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
+	subq	$EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
 	.endif
 
-	call \do_sym
+	call	\do_sym
 
 	.if \shift_ist != -1
-	addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
+	addq	$EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
 	.endif
 
 	/* these procedures expect "no swapgs" flag in ebx */
 	.if \paranoid
-	jmp paranoid_exit
+	jmp	paranoid_exit
 	.else
-	jmp error_exit
+	jmp	error_exit
 	.endif
 
 	.if \paranoid == 1
-	CFI_RESTORE_STATE
 	/*
 	 * Paranoid entry from userspace.  Switch stacks and treat it
 	 * as a normal entry.  This means that paranoid handlers
 	 * run in real process context if user_mode(regs).
 	 */
 1:
-	call error_entry
+	call	error_entry
 
-	DEFAULT_FRAME 0
 
-	movq %rsp,%rdi			/* pt_regs pointer */
-	call sync_regs
-	movq %rax,%rsp			/* switch stack */
+	movq	%rsp, %rdi			/* pt_regs pointer */
+	call	sync_regs
+	movq	%rax, %rsp			/* switch stack */
 
-	movq %rsp,%rdi			/* pt_regs pointer */
+	movq	%rsp, %rdi			/* pt_regs pointer */
 
 	.if \has_error_code
-	movq ORIG_RAX(%rsp),%rsi	/* get error code */
-	movq $-1,ORIG_RAX(%rsp)		/* no syscall to restart */
+	movq	ORIG_RAX(%rsp), %rsi		/* get error code */
+	movq	$-1, ORIG_RAX(%rsp)		/* no syscall to restart */
 	.else
-	xorl %esi,%esi			/* no error code */
+	xorl	%esi, %esi			/* no error code */
 	.endif
 
-	call \do_sym
+	call	\do_sym
 
-	jmp error_exit			/* %ebx: no swapgs flag */
+	jmp	error_exit			/* %ebx: no swapgs flag */
 	.endif
-
-	CFI_ENDPROC
 END(\sym)
 .endm
 
@@ -785,41 +651,33 @@ idtentry \sym \do_sym has_error_code=\ha
 # existing activation in its critical region -- if so, we pop the current
 # activation and restart the handler using the previous one.
 ENTRY(do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)
-	CFI_STARTPROC
 # Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
 # see the correct pointer to the pt_regs
-	movq %rdi, %rsp            # we don't return, adjust the stack frame
-	CFI_ENDPROC
-	DEFAULT_FRAME
-11:	incl PER_CPU_VAR(irq_count)
-	movq %rsp,%rbp
-	CFI_DEF_CFA_REGISTER rbp
-	cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
-	pushq %rbp			# backlink for old unwinder
-	call evtchn_do_upcall
-	popq %rsp
-	CFI_DEF_CFA_REGISTER rsp
-	decl PER_CPU_VAR(irq_count)
-	jmp  error_exit
-	CFI_ENDPROC
+	movq	%rdi, %rsp		# we don't return, adjust the stack frame
+11:	incl	PER_CPU_VAR(irq_count)
+	movq	%rsp, %rbp
+	cmovzq	PER_CPU_VAR(irq_stack_ptr), %rsp
+	pushq	%rbp			# backlink for old unwinder
+	call	evtchn_do_upcall
+	popq	%rsp
+	decl	PER_CPU_VAR(irq_count)
+	jmp	error_exit
 END(do_hypervisor_callback)
 
         ALIGN
 restore_all_enable_events:  
-	DEFAULT_FRAME extra=0
 	TRACE_IRQS_ON
 	__ENABLE_INTERRUPTS
 
 .Lscrit: /**** START OF CRITICAL REGION ****/
 	__TEST_PENDING
-	jz .Lrestore_c_regs_and_iret
+	jz	.Lrestore_c_regs_and_iret
 
 	__DISABLE_INTERRUPTS
 .Lecrit: /**** END OF CRITICAL REGION ****/
 	SAVE_EXTRA_REGS
-        movq %rsp,%rdi                  # set the argument again
-	jmp  11b
-	CFI_ENDPROC
+	movq	%rsp, %rdi		# set the argument again
+	jmp	11b
 # At this point, unlike on x86-32, we don't do the fixup to simplify the 
 # code and the stack frame is more complex on x86-64.
 # When the kernel is interrupted in the critical section, the kernel 
@@ -838,85 +696,75 @@ restore_all_enable_events:
 # We distinguish between categories by comparing each saved segment register
 # with its current contents: any discrepancy means we in category 1.
 ENTRY(failsafe_callback)
-	INTR_FRAME offset=4*8
 	ASM_CLAC
-	movw %ds,%cx
-	cmpw %cx,0x10(%rsp)
-	CFI_REMEMBER_STATE
-	jne 1f
-	movw %es,%cx
-	cmpw %cx,0x18(%rsp)
-	jne 1f
-	movw %fs,%cx
-	cmpw %cx,0x20(%rsp)
-	jne 1f
-	movw %gs,%cx
-	cmpw %cx,0x28(%rsp)
-	jne 1f
+	movw	%ds, %cx
+	cmpw	%cx, 0x10(%rsp)
+	jne	1f
+	movw	%es, %cx
+	cmpw	%cx, 0x18(%rsp)
+	jne	1f
+	movw	%fs, %cx
+	cmpw	%cx, 0x20(%rsp)
+	jne	1f
+	movw	%gs, %cx
+	cmpw	%cx, 0x28(%rsp)
+	jne	1f
 	/* All segments match their saved values => Category 2 (Bad IRET). */
-	movq_cfi_restore 0,rcx
-	movq_cfi_restore 8,r11
-	addq $0x30,%rsp
-	CFI_ADJUST_CFA_OFFSET -0x30
-	movq $11,%rdi	/* SIGSEGV */
-	jmp do_exit			
-	CFI_RESTORE_STATE
+	movq	0(%rsp), %rcx
+	movq	8(%rsp), %r11
+	addq	$0x30, %rsp
+	movq	$11, %rdi		/* SIGSEGV */
+	jmp	do_exit
 1:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
-	movq_cfi_restore 0,rcx
-	movq_cfi_restore 8,r11
-	addq $0x30,%rsp
-	CFI_ADJUST_CFA_OFFSET -0x30
-	pushq_cfi $-1
+	movq	0(%rsp), %rcx
+	movq	8(%rsp), %r11
+	addq	$0x30, %rsp
+	pushq	$-1
 	ALLOC_PT_GPREGS_ON_STACK
 	SAVE_C_REGS
 	SAVE_EXTRA_REGS
-	jmp error_exit
-	CFI_ENDPROC
+	jmp	error_exit
 
-idtentry divide_error do_divide_error has_error_code=0
-idtentry overflow do_overflow has_error_code=0
-idtentry bounds do_bounds has_error_code=0
-idtentry invalid_op do_invalid_op has_error_code=0
-idtentry device_not_available do_device_not_available has_error_code=0
-idtentry hypervisor_callback do_hypervisor_callback has_error_code=0
-idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
-idtentry invalid_TSS do_invalid_TSS has_error_code=1
-idtentry segment_not_present do_segment_not_present has_error_code=1
-idtentry coprocessor_error do_coprocessor_error has_error_code=0
-idtentry alignment_check do_alignment_check has_error_code=1
-idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
+idtentry divide_error			do_divide_error			has_error_code=0
+idtentry overflow			do_overflow			has_error_code=0
+idtentry bounds				do_bounds			has_error_code=0
+idtentry invalid_op			do_invalid_op			has_error_code=0
+idtentry device_not_available		do_device_not_available		has_error_code=0
+idtentry hypervisor_callback		do_hypervisor_callback		has_error_code=0
+idtentry coprocessor_segment_overrun	do_coprocessor_segment_overrun	has_error_code=0
+idtentry invalid_TSS			do_invalid_TSS			has_error_code=1
+idtentry segment_not_present		do_segment_not_present		has_error_code=1
+idtentry coprocessor_error		do_coprocessor_error		has_error_code=0
+idtentry alignment_check		do_alignment_check		has_error_code=1
+idtentry simd_coprocessor_error		do_simd_coprocessor_error	has_error_code=0
 
 /* Call softirq on interrupt stack. Interrupts are off. */
 ENTRY(do_softirq_own_stack)
-	CFI_STARTPROC
-	pushq_cfi %rbp
-	CFI_REL_OFFSET rbp,0
-	mov  %rsp,%rbp
-	CFI_DEF_CFA_REGISTER rbp
-	incl PER_CPU_VAR(irq_count)
-	cmove PER_CPU_VAR(irq_stack_ptr),%rsp
-	push  %rbp			# backlink for old unwinder
-	call __do_softirq
+	pushq	%rbp
+	mov	%rsp, %rbp
+	incl	PER_CPU_VAR(irq_count)
+	cmove	PER_CPU_VAR(irq_stack_ptr), %rsp
+	push	%rbp				/* frame pointer backlink */
+	call	__do_softirq
 	leaveq
-	CFI_RESTORE		rbp
-	CFI_DEF_CFA_REGISTER	rsp
-	CFI_ADJUST_CFA_OFFSET   -8
-	decl PER_CPU_VAR(irq_count)
+	decl	PER_CPU_VAR(irq_count)
 	ret
-	CFI_ENDPROC
 END(do_softirq_own_stack)
 
-idtentry debug do_debug has_error_code=0
-idtentry nmi do_nmi_callback has_error_code=0
-idtentry int3 do_int3 has_error_code=0
-idtentry stack_segment do_stack_segment has_error_code=1
-idtentry general_protection do_general_protection has_error_code=1
-trace_idtentry page_fault do_page_fault has_error_code=1
+idtentry debug			do_debug		has_error_code=0
+idtentry nmi			do_nmi_callback		has_error_code=0
+idtentry int3			do_int3			has_error_code=0
+idtentry stack_segment		do_stack_segment	has_error_code=1
+
+idtentry general_protection	do_general_protection	has_error_code=1
+trace_idtentry page_fault	do_page_fault		has_error_code=1
+
 #ifdef CONFIG_KVM_GUEST
-idtentry async_page_fault do_async_page_fault has_error_code=1
+idtentry async_page_fault	do_async_page_fault	has_error_code=1
 #endif
+
 #ifdef CONFIG_X86_MCE
-idtentry machine_check has_error_code=0 do_sym=*machine_check_vector(%rip)
+idtentry machine_check					has_error_code=0	do_sym=*machine_check_vector(%rip)
 #endif
 
 #ifndef CONFIG_XEN
@@ -926,19 +774,17 @@ idtentry machine_check has_error_code=0
  * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
  */
 ENTRY(paranoid_entry)
-	XCPT_FRAME 1 15*8
 	cld
 	SAVE_C_REGS 8
 	SAVE_EXTRA_REGS 8
-	movl $1,%ebx
-	movl $MSR_GS_BASE,%ecx
+	movl	$1, %ebx
+	movl	$MSR_GS_BASE, %ecx
 	rdmsr
-	testl %edx,%edx
-	js 1f	/* negative -> in kernel */
+	testl	%edx, %edx
+	js	1f				/* negative -> in kernel */
 	SWAPGS
-	xorl %ebx,%ebx
+	xorl	%ebx, %ebx
 1:	ret
-	CFI_ENDPROC
 END(paranoid_entry)
 
 /*
@@ -950,17 +796,17 @@ END(paranoid_entry)
  * in syscall entry), so checking for preemption here would
  * be complicated.  Fortunately, we there's no good reason
  * to try to handle preemption here.
+ *
+ * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
  */
-/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
 ENTRY(paranoid_exit)
-	DEFAULT_FRAME
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF_DEBUG
-	testl %ebx,%ebx				/* swapgs needed? */
-	jnz paranoid_exit_no_swapgs
+	testl	%ebx, %ebx			/* swapgs needed? */
+	jnz	paranoid_exit_no_swapgs
 	TRACE_IRQS_IRETQ
 	SWAPGS_UNSAFE_STACK
-	jmp paranoid_exit_restore
+	jmp	paranoid_exit_restore
 paranoid_exit_no_swapgs:
 	TRACE_IRQS_IRETQ_DEBUG
 paranoid_exit_restore:
@@ -968,26 +814,26 @@ paranoid_exit_restore:
 	RESTORE_C_REGS
 	REMOVE_PT_GPREGS_FROM_STACK 8
 	INTERRUPT_RETURN
-	CFI_ENDPROC
 END(paranoid_exit)
 #endif
 
 /*
  * Save all registers in pt_regs, and switch gs if needed.
- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ * Return: EBX=0: came from user mode; EBX=1: otherwise
  */
 ENTRY(error_entry)
-	XCPT_FRAME 2 15*8
 	cld
 	SAVE_C_REGS 8
 	SAVE_EXTRA_REGS 8
 #ifndef CONFIG_XEN
-	xorl %ebx,%ebx
-	testl $3,CS+8(%rsp)
-	je error_kernelspace
-error_swapgs:
+	xorl	%ebx, %ebx
+	testb	$3, CS+8(%rsp)
+	jz	error_kernelspace
+
+	/* We entered from user mode */
 	SWAPGS
-error_sti:
+
+error_entry_done:
 #endif
 	TRACE_IRQS_OFF
 	ret
@@ -1000,53 +846,59 @@ error_sti:
 	 * for these here too.
 	 */
 error_kernelspace:
-	CFI_REL_OFFSET rcx, RCX+8
-	incl %ebx
-	leaq native_irq_return_iret(%rip),%rcx
-	cmpq %rcx,RIP+8(%rsp)
-	je error_bad_iret
-	movl %ecx,%eax	/* zero extend */
-	cmpq %rax,RIP+8(%rsp)
-	je bstep_iret
-	cmpq $gs_change,RIP+8(%rsp)
-	je error_swapgs
-	jmp error_sti
+	incl	%ebx
+	leaq	native_irq_return_iret(%rip), %rcx
+	cmpq	%rcx, RIP+8(%rsp)
+	je	error_bad_iret
+	movl	%ecx, %eax			/* zero extend */
+	cmpq	%rax, RIP+8(%rsp)
+	je	bstep_iret
+	cmpq	$gs_change, RIP+8(%rsp)
+	jne	error_entry_done
+
+	/*
+	 * hack: gs_change can fail with user gsbase.  If this happens, fix up
+	 * gsbase and proceed.  We'll fix up the exception and land in
+	 * gs_change's error handler with kernel gsbase.
+	 */
+	SWAPGS
+	jmp	error_entry_done
 
 bstep_iret:
 	/* Fix truncated RIP */
-	movq %rcx,RIP+8(%rsp)
+	movq	%rcx, RIP+8(%rsp)
 	/* fall through */
 
 error_bad_iret:
+	/*
+	 * We came from an IRET to user mode, so we have user gsbase.
+	 * Switch to kernel gsbase:
+	 */
 	SWAPGS
-	mov %rsp,%rdi
-	call fixup_bad_iret
-	mov %rax,%rsp
-	decl %ebx	/* Return to usergs */
-	jmp error_sti
+
+	/*
+	 * Pretend that the exception came from user mode: set up pt_regs
+	 * as if we faulted immediately after IRET and clear EBX so that
+	 * error_exit knows that we will be returning to user mode.
+	 */
+	mov	%rsp, %rdi
+	call	fixup_bad_iret
+	mov	%rax, %rsp
+	decl	%ebx
+	jmp	error_entry_done
 #endif
-	CFI_ENDPROC
 END(error_entry)
 
 
 ENTRY(error_exit)
-	DEFAULT_FRAME
 	RESTORE_EXTRA_REGS
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	GET_THREAD_INFO(%rcx)
-	testb $3,CS(%rsp)
-	jz retint_kernel
-	LOCKDEP_SYS_EXIT_IRQ
-	movl TI_flags(%rcx),%edx
-	movl $_TIF_WORK_MASK,%edi
-	andl %edi,%edx
-	jnz retint_careful
-	jmp restore_c_regs_and_iret
-	CFI_ENDPROC
+	testb	$3, CS(%rsp)
+	jz	retint_kernel
+	jmp	retint_user
 END(error_exit)
 
-
 #define extern #
 #include <asm-generic/percpu.h>
 
@@ -1055,38 +907,29 @@ in_NMI:	.byte	0
 .popsection
 
 do_nmi_callback:
-	CFI_STARTPROC
-	addq $8, %rsp
-	CFI_ENDPROC
-	DEFAULT_FRAME
-	orb  $1, PER_CPU_VAR(in_NMI)
-	js   1f
+	addq	$8, %rsp
+	orb	$1, PER_CPU_VAR(in_NMI)
+	js	1f
 0:
-	movb $0x80, PER_CPU_VAR(in_NMI)
-	call do_nmi
-	movl $0x80, %eax
+	movb	$0x80, PER_CPU_VAR(in_NMI)
+	call	do_nmi
+	movl	$0x80, %eax
 	cmpxchgb %ah, PER_CPU_VAR(in_NMI)
-	jne  0b
-	orl  $NMI_MASK,EFLAGS(%rsp)
+	jne	0b
+	orl	$NMI_MASK, EFLAGS(%rsp)
 1:
 	RESTORE_EXTRA_REGS
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	GET_THREAD_INFO(%rcx)
-	jmp  restore_c_regs_and_iret
-	CFI_ENDPROC
+	jmp	restore_c_regs_and_iret
 END(do_nmi_callback)
 
-
 #ifndef CONFIG_IA32_EMULATION
 ENTRY(ignore_sysret)
-	INTR_FRAME
-	popq_cfi_reg rcx
-	popq_cfi_reg r11
-	mov $-ENOSYS,%eax
-	# any non-zero value not having VGCF_in_syscall set will do:
+	popq	%rcx
+	popq	%r11
+	mov	$-ENOSYS, %eax
 	HYPERVISOR_IRET VGCF_i387_valid
-	CFI_ENDPROC
 END(ignore_sysret)
 #endif
-
--- a/arch/x86/entry/entry_64_compat-xen.S
+++ b/arch/x86/entry/entry_64_compat-xen.S
@@ -1,16 +1,14 @@
 /*
- * Compatibility mode system call entry point for x86-64. 
- * 		
+ * Compatibility mode system call entry point for x86-64.
+ *
  * Copyright 2000-2002 Andi Kleen, SuSE Labs.
- */		 
-
-#include <asm/dwarf2.h>
-#include <asm/calling.h>
+ */
+#include "calling.h"
 #include <asm/asm-offsets.h>
 #include <asm/current.h>
 #include <asm/errno.h>
-#include <asm/ia32_unistd.h>	
-#include <asm/thread_info.h>	
+#include <asm/ia32_unistd.h>
+#include <asm/thread_info.h>
 #include <asm/segment.h>
 #include <asm/irqflags.h>
 #include <asm/asm.h>
@@ -21,67 +19,19 @@
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
 #define AUDIT_ARCH_I386		(EM_386|__AUDIT_ARCH_LE)
-#define __AUDIT_ARCH_LE	   0x40000000
+#define __AUDIT_ARCH_LE		0x40000000
 
 	.section .entry.text, "ax"
 
-	/* clobbers %rax */
-	.macro  CLEAR_RREGS _r9=rax
-	xorl 	%eax,%eax
-	movq	%rax,R11(%rsp)
-	movq	%rax,R10(%rsp)
-	movq	%\_r9,R9(%rsp)
-	movq	%rax,R8(%rsp)
-	.endm
-
-	/*
-	 * Reload arg registers from stack in case ptrace changed them.
-	 * We don't reload %eax because syscall_trace_enter() returned
-	 * the %rax value we should see.  Instead, we just truncate that
-	 * value to 32 bits again as we did on entry from user mode.
-	 * If it's a new value set by user_regset during entry tracing,
-	 * this matches the normal truncation of the user-mode value.
-	 * If it's -1 to make us punt the syscall, then (u32)-1 is still
-	 * an appropriately invalid value.
-	 */
-	.macro LOAD_ARGS32 _r9=0
-	.if \_r9
-	movl R9(%rsp),%r9d
-	.endif
-	movl RCX(%rsp),%ecx
-	movl RDX(%rsp),%edx
-	movl RSI(%rsp),%esi
-	movl RDI(%rsp),%edi
-	movl %eax,%eax			/* zero extension */
-	.endm
-
-	.macro CFI_STARTPROC32 simple
-	CFI_STARTPROC	\simple
-	CFI_UNDEFINED	r8
-	CFI_UNDEFINED	r9
-	CFI_UNDEFINED	r10
-	CFI_UNDEFINED	r11
-	CFI_UNDEFINED	r12
-	CFI_UNDEFINED	r13
-	CFI_UNDEFINED	r14
-	CFI_UNDEFINED	r15
-	.endm
-
 #ifdef CONFIG_PARAVIRT
 ENTRY(native_usergs_sysret32)
 	swapgs
 	sysretl
 ENDPROC(native_usergs_sysret32)
-
-ENTRY(native_irq_enable_sysexit)
-	swapgs
-	sti
-	sysexit
-ENDPROC(native_irq_enable_sysexit)
 #endif
 
 /*
- * 32bit SYSENTER instruction entry.
+ * 32-bit SYSENTER instruction entry.
  *
  * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
  * IF and VM in rflags are cleared (IOW: interrupts are off).
@@ -102,94 +52,104 @@ ENDPROC(native_irq_enable_sysexit)
  * path below. We set up a complete hardware stack frame to share code
  * with the int 0x80 path.
  */
-ENTRY(ia32_sysenter_target)
-	CFI_STARTPROC32	simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,SS+8-RIP+16
-	/*CFI_REL_OFFSET	ss,SS-RIP+16*/
-	CFI_REL_OFFSET	rsp,RSP-RIP+16
-	/*CFI_REL_OFFSET	rflags,EFLAGS-RIP+16*/
-	/*CFI_REL_OFFSET	cs,CS-RIP+16*/
-	CFI_REL_OFFSET	rip,RIP-RIP+16
-	CFI_REL_OFFSET	r11,8
-	CFI_REL_OFFSET	rcx,0
+ENTRY(entry_SYSENTER_compat)
 	movq	8(%rsp),%r11
-	CFI_RESTORE	r11
-	popq_cfi %rcx
-	CFI_RESTORE	rcx
+	popq	%rcx
 
 	/* Zero-extending 32-bit regs, do not remove */
 	movl	%ebp, %ebp
 	movl	%eax, %eax
 	movl	ASM_THREAD_INFO(TI_sysenter_return, %rsp, 6*8), %r10d
-	CFI_REGISTER rip,r10
 	movl	$__USER32_DS,40(%rsp)
 	movq	%rbp,32(%rsp)
 	movl	$__USER32_CS,16(%rsp)
 	movq	%r10,8(%rsp)
 
 	/* Construct struct pt_regs on stack */
-	movq	%rax,(%rsp)			/* pt_regs->orig_ax */
-	pushq_cfi_reg	rdi			/* pt_regs->di */
-	pushq_cfi_reg	rsi			/* pt_regs->si */
-	pushq_cfi_reg	rdx			/* pt_regs->dx */
-	pushq_cfi_reg	rcx			/* pt_regs->cx */
-	pushq_cfi_reg	rax			/* pt_regs->ax */
+	movq	%rax,(%rsp)		/* pt_regs->orig_ax */
+	pushq	%rdi			/* pt_regs->di */
+	pushq	%rsi			/* pt_regs->si */
+	pushq	%rdx			/* pt_regs->dx */
+	pushq	%rcx			/* pt_regs->cx */
+	pushq	$-ENOSYS		/* pt_regs->ax */
 	cld
-	sub	$(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
-	CFI_ADJUST_CFA_OFFSET 10*8
+	sub	$(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */
 
 	/*
 	 * no need to do an access_ok check here because rbp has been
-	 * 32bit zero extended
+	 * 32-bit zero extended
 	 */
 	ASM_STAC
-1:	movl	(%rbp),%ebp
-	_ASM_EXTABLE(1b,ia32_badarg)
+1:	movl	(%rbp), %ebp
+	_ASM_EXTABLE(1b, ia32_badarg)
 	ASM_CLAC
 	orl     $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
 	testl   $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	jnz  sysenter_tracesys
-	jmp .Lia32_check_call
+	jnz	sysenter_tracesys
+	jmp	ia32_do_call
 
 #ifdef CONFIG_AUDITSYSCALL
 	.macro auditsys_entry_common
-	movl %esi,%r8d			/* 5th arg: 4th syscall arg */
-	movl %ecx,%r9d			/*swap with edx*/
-	movl %edx,%ecx			/* 4th arg: 3rd syscall arg */
-	movl %r9d,%edx			/* 3rd arg: 2nd syscall arg */
-	movl %ebx,%esi			/* 2nd arg: 1st syscall arg */
-	movl %eax,%edi			/* 1st arg: syscall number */
-	call __audit_syscall_entry
-	movl RAX(%rsp),%eax	/* reload syscall number */
-	cmpq $(IA32_NR_syscalls-1),%rax
-	ja ia32_badsys
-	movl %ebx,%edi			/* reload 1st syscall arg */
-	movl RCX(%rsp),%esi	/* reload 2nd syscall arg */
-	movl RDX(%rsp),%edx	/* reload 3rd syscall arg */
-	movl RSI(%rsp),%ecx	/* reload 4th syscall arg */
-	movl RDI(%rsp),%r8d	/* reload 5th syscall arg */
+	/*
+	 * At this point, registers hold syscall args in the 32-bit syscall ABI:
+	 * EAX is syscall number, the 6 args are in EBX,ECX,EDX,ESI,EDI,EBP.
+	 *
+	 * We want to pass them to __audit_syscall_entry(), which is a 64-bit
+	 * C function with 5 parameters, so shuffle them to match what
+	 * the function expects: RDI,RSI,RDX,RCX,R8.
+	 */
+	movl	%esi, %r8d		/* arg5 (R8 ) <= 4th syscall arg (ESI) */
+	xchg	%ecx, %edx		/* arg4 (RCX) <= 3rd syscall arg (EDX) */
+					/* arg3 (RDX) <= 2nd syscall arg (ECX) */
+	movl	%ebx, %esi		/* arg2 (RSI) <= 1st syscall arg (EBX) */
+	movl	%eax, %edi		/* arg1 (RDI) <= syscall number  (EAX) */
+	call	__audit_syscall_entry
+
+	/*
+	 * We are going to jump back to the syscall dispatch code.
+	 * Prepare syscall args as required by the 64-bit C ABI.
+	 * Registers clobbered by __audit_syscall_entry() are
+	 * loaded from pt_regs on stack:
+	 */
+	movl	ORIG_RAX(%rsp), %eax	/* syscall number */
+	movl	%ebx, %edi		/* arg1 */
+	movl	RCX(%rsp), %esi		/* arg2 */
+	movl	RDX(%rsp), %edx		/* arg3 */
+	movl	RSI(%rsp), %ecx		/* arg4 */
+	movl	RDI(%rsp), %r8d		/* arg5 */
 	.endm
 
 sysenter_auditsys:
 	auditsys_entry_common
-	movl %ebp,%r9d			/* reload 6th syscall arg */
-	jmp .Lia32_dispatch
+	movl	%ebp, %r9d		/* reload 6th syscall arg */
+	jmp	.Lia32_dispatch
+#endif
+
+sysenter_tracesys:
+#ifdef CONFIG_AUDITSYSCALL
+	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	jz	sysenter_auditsys
 #endif
-	CFI_ENDPROC
-ENDPROC(ia32_sysenter_target)
+	SAVE_EXTRA_REGS
+	xorl	%eax, %eax		/* Do not leak kernel information */
+	movq	%rax, R11(%rsp)
+	movq	%rax, R10(%rsp)
+	movq	%rax, R9(%rsp)
+	movq	%rax, R8(%rsp)
+	jmp	.Ltracesys
+ENDPROC(entry_SYSENTER_compat)
 
 /*
- * 32bit SYSCALL instruction entry.
+ * 32-bit SYSCALL instruction entry.
  *
- * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
  * then loads new ss, cs, and rip from previously programmed MSRs.
  * rflags gets masked by a value from another MSR (so CLD and CLAC
  * are not needed). SYSCALL does not save anything on the stack
  * and does not change rsp.
  *
  * Note: rflags saving+masking-with-MSR happens only in Long mode
- * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it).
+ * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it).
  * Don't get confused: rflags saving+masking depends on Long Mode Active bit
  * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
  * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
@@ -209,84 +169,86 @@ ENDPROC(ia32_sysenter_target)
  * path below. We set up a complete hardware stack frame to share code
  * with the int 0x80 path.
  */
-ENTRY(ia32_cstar_target)
-	CFI_STARTPROC32	simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,SS+8-RIP+16
-	/*CFI_REL_OFFSET	ss,SS-RIP+16*/
-	CFI_REL_OFFSET	rsp,RSP-RIP+16
-	/*CFI_REL_OFFSET	rflags,EFLAGS-RIP+16*/
-	/*CFI_REL_OFFSET	cs,CS-RIP+16*/
-	CFI_REL_OFFSET	rip,RIP-RIP+16
-	movl	RSP-RIP+16(%rsp),%r8d
+ENTRY(entry_SYSCALL_compat)
+	movl	RSP-RIP+16(%rsp), %r8d
 
 	/* Zero-extending 32-bit regs, do not remove */
-	movl	%eax,%eax
+	movl	%eax, %eax
 
 	/* Construct struct pt_regs on stack */
-	movl		$__USER32_DS,6*8(%rsp)	/* pt_regs->ss */
-	movl		$__USER32_CS,3*8(%rsp)	/* pt_regs->cs */
-	movq_cfi	rax,8			/* pt_regs->orig_ax */
-	movq_cfi	rdi,0			/* pt_regs->di */
-	pushq_cfi_reg	rsi			/* pt_regs->si */
-	pushq_cfi_reg	rdx			/* pt_regs->dx */
-	pushq_cfi_reg	rbp			/* pt_regs->cx */
-	movl	%ebp,%ecx
-	pushq_cfi	%rax			/* pt_regs->ax */
-	sub	$(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
-	CFI_ADJUST_CFA_OFFSET 10*8
+	movl	$__USER32_DS, 6*8(%rsp)	/* pt_regs->ss */
+	movl	$__USER32_CS, 3*8(%rsp)	/* pt_regs->cs */
+	movq	%rax, 8(%rsp)		/* pt_regs->orig_ax */
+	movq	%rdi, 0(%rsp)		/* pt_regs->di */
+	pushq	%rsi			/* pt_regs->si */
+	pushq	%rdx			/* pt_regs->dx */
+	pushq	%rbp			/* pt_regs->cx */
+	movl	%ebp, %ecx
+	pushq	$-ENOSYS		/* pt_regs->ax */
+	sub	$(10*8), %rsp		/* pt_regs->r8-11, bp, bx, r12-15 not saved */
 
 	/*
-	 * no need to do an access_ok check here because r8 has been
-	 * 32bit zero extended
+	 * No need to do an access_ok check here because r8 has been
+	 * 32-bit zero extended:
 	 */
 	ASM_STAC
-1:	movl	(%r8),%r9d
-	_ASM_EXTABLE(1b,ia32_badarg)
+1:	movl	(%r8), %r9d
+	_ASM_EXTABLE(1b, ia32_badarg)
 	ASM_CLAC
-	orl     $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
-	testl   $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	jnz   cstar_tracesys
-	cmpq $IA32_NR_syscalls-1,%rax
-	ja  ia32_badsys
+	orl	$TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	jnz	cstar_tracesys
 cstar_do_call:
-	/* 32bit syscall -> 64bit C ABI argument conversion */
-	movl	%edi,%r8d	/* arg5 */
-	/* r9 already loaded */	/* arg6 */
+	/* 32-bit syscall -> 64-bit C ABI argument conversion */
+	movl	%edi, %r8d		/* arg5 */
+	/* r9 already loaded */		/* arg6 */
 	jmp	.Lia32_arg_fixup_common
-	
+
 #ifdef CONFIG_AUDITSYSCALL
 cstar_auditsys:
-	movl %r9d,R9(%rsp)	/* register to be clobbered by call */
+	movl	%r9d, R9(%rsp)		/* register to be clobbered by call */
 	auditsys_entry_common
-	movl R9(%rsp),%r9d	/* reload 6th syscall arg */
-	jmp .Lia32_dispatch
+	movl	R9(%rsp), %r9d		/* reload 6th syscall arg */
+	jmp	.Lia32_dispatch
 #endif
 
 cstar_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	jz cstar_auditsys
+	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	jz	cstar_auditsys
 #endif
-	xchgl %r9d,%ebp
+	xchgl	%r9d, %ebp
 	SAVE_EXTRA_REGS
-	CLEAR_RREGS r9
-	movq $-ENOSYS,RAX(%rsp)	/* ptrace can change this for a bad syscall */
-	movq %rsp,%rdi        /* &pt_regs -> arg1 */
-	call syscall_trace_enter
-	LOAD_ARGS32 1	/* reload args from stack in case ptrace changed it */
+	xorl	%eax, %eax		/* Do not leak kernel information */
+	movq	%rax, R11(%rsp)
+	movq	%rax, R10(%rsp)
+	movq	%r9, R9(%rsp)
+	movq	%rax, R8(%rsp)
+	movq	%rsp, %rdi		/* &pt_regs -> arg1 */
+	call	syscall_trace_enter
+	movl	R9(%rsp), %r9d
+
+	/* Reload arg registers from stack. (see sysenter_tracesys) */
+	movl	RCX(%rsp), %ecx
+	movl	RDX(%rsp), %edx
+	movl	RSI(%rsp), %esi
+	movl	RDI(%rsp), %edi
+	movl	%eax, %eax		/* zero extension */
+
 	RESTORE_EXTRA_REGS
-	xchgl %ebp,%r9d
-	cmpq $(IA32_NR_syscalls-1),%rax
-	ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
-	jmp cstar_do_call
-END(ia32_cstar_target)
-				
+	xchgl	%ebp, %r9d
+	jmp	cstar_do_call
+END(entry_SYSCALL_compat)
+
 ia32_badarg:
 	ASM_CLAC
-	movq $-EFAULT,%rax
-	jmp ia32_sysret
-	CFI_ENDPROC
+	movq	$-EFAULT, RAX(%rsp)
+	xorl	%eax, %eax		/* Do not leak kernel information */
+	movq	%rax, R11(%rsp)
+	movq	%rax, R10(%rsp)
+	movq	%rax, R9(%rsp)
+	movq	%rax, R8(%rsp)
+	jmp	int_ret_from_sys_call
 
 /*
  * Emulated IA32 system calls via int 0x80.
@@ -309,121 +271,98 @@ ia32_badarg:
  * Assumes it is only called from user space and entered with interrupts on.
  */
 
-ENTRY(ia32_syscall)
-	CFI_STARTPROC32	simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,5*8+16
-	/*CFI_REL_OFFSET	ss,4*8+16 */
-	CFI_REL_OFFSET	rsp,3*8+16
-	/*CFI_REL_OFFSET	rflags,2*8+16 */
-	/*CFI_REL_OFFSET	cs,1*8+16 */
-	CFI_REL_OFFSET	rip,0*8+16
-	CFI_REL_OFFSET	r11,8
-	CFI_REL_OFFSET	rcx,0
-	movq 8(%rsp),%r11
-	CFI_RESTORE	r11
-	popq_cfi %rcx
-	CFI_RESTORE	rcx
+ENTRY(entry_INT80_compat)
+	movq	8(%rsp),%r11
+	popq	%rcx
 
 	/* Zero-extending 32-bit regs, do not remove */
-	movl	%eax,%eax
+	movl	%eax, %eax
 
 	/* Construct struct pt_regs on stack (iret frame is already on stack) */
-	movq		%rax,(%rsp)		/* pt_regs->orig_ax */
-	pushq_cfi_reg	rdi			/* pt_regs->di */
-	pushq_cfi_reg	rsi			/* pt_regs->si */
-	pushq_cfi_reg	rdx			/* pt_regs->dx */
-	pushq_cfi_reg	rcx			/* pt_regs->cx */
-	pushq_cfi_reg	rax			/* pt_regs->ax */
+	movq	%rax,(%rsp)		/* pt_regs->orig_ax */
+	pushq	%rdi			/* pt_regs->di */
+	pushq	%rsi			/* pt_regs->si */
+	pushq	%rdx			/* pt_regs->dx */
+	pushq	%rcx			/* pt_regs->cx */
+	pushq	$-ENOSYS		/* pt_regs->ax */
+	pushq	$0			/* pt_regs->r8 */
+	pushq	$0			/* pt_regs->r9 */
+	pushq	$0			/* pt_regs->r10 */
+	pushq	$0			/* pt_regs->r11 */
 	cld
-	sub	$(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
-	CFI_ADJUST_CFA_OFFSET 10*8
+	sub	$(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
+
+	orl	$TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	jnz	ia32_tracesys
 
-	orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
-	testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	jnz ia32_tracesys
-.Lia32_check_call:
-	cmpq $(IA32_NR_syscalls-1),%rax
-	ja ia32_badsys
 ia32_do_call:
-	/* 32bit syscall -> 64bit C ABI argument conversion */
-	movl %edi,%r8d	/* arg5 */
-	movl %ebp,%r9d	/* arg6 */
+	/* 32-bit syscall -> 64-bit C ABI argument conversion */
+	movl	%edi, %r8d		/* arg5 */
+	movl	%ebp, %r9d		/* arg6 */
 .Lia32_arg_fixup_common:
-	xchg %ecx,%esi	/* rsi:arg2, rcx:arg4 */
-	movl %ebx,%edi	/* arg1 */
-	movl %edx,%edx	/* arg3 (zero extension) */
+	xchg	%ecx, %esi		/* rsi:arg2, rcx:arg4 */
+	movl	%ebx, %edi		/* arg1 */
+	movl	%edx, %edx		/* arg3 (zero extension) */
 .Lia32_dispatch:
-	call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
-ia32_sysret:
-	movq %rax,RAX(%rsp)
-	CLEAR_RREGS
-	jmp int_ret_from_sys_call
+	cmpq	$(IA32_NR_syscalls-1), %rax
+	ja	1f
+
+	call	*ia32_sys_call_table(, %rax, 8)
+	movq	%rax, RAX(%rsp)
+1:
+	jmp	int_ret_from_sys_call
 
-sysenter_tracesys:
-#ifdef CONFIG_AUDITSYSCALL
-	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	jz sysenter_auditsys
-#endif
 ia32_tracesys:
 	SAVE_EXTRA_REGS
-	CLEAR_RREGS
-	movq $-ENOSYS,RAX(%rsp)	/* ptrace can change this for a bad syscall */
-	movq %rsp,%rdi        /* &pt_regs -> arg1 */
-	call syscall_trace_enter
-	LOAD_ARGS32	/* reload args from stack in case ptrace changed it */
+.Ltracesys:
+	movq	%rsp, %rdi			/* &pt_regs -> arg1 */
+	call	syscall_trace_enter
+	/*
+	 * Reload arg registers from stack in case ptrace changed them.
+	 * Don't reload %eax because syscall_trace_enter() returned
+	 * the %rax value we should see.  But do truncate it to 32 bits.
+	 * If it's -1 to make us punt the syscall, then (u32)-1 is still
+	 * an appropriately invalid value.
+	 */
+	movl	RCX(%rsp), %ecx
+	movl	RDX(%rsp), %edx
+	movl	RSI(%rsp), %esi
+	movl	RDI(%rsp), %edi
+	movl	%eax, %eax		/* zero extension */
 	RESTORE_EXTRA_REGS
-	cmpq $(IA32_NR_syscalls-1),%rax
-	ja  int_ret_from_sys_call	/* ia32_tracesys has set RAX(%rsp) */
-	jmp ia32_do_call
-END(ia32_syscall)
-
-ia32_badsys:
-	movq $0,ORIG_RAX(%rsp)
-	movq $-ENOSYS,%rax
-	jmp ia32_sysret
+	jmp	ia32_do_call
+END(entry_INT80_compat)
 
-	CFI_ENDPROC
-	
 	.macro PTREGSCALL label, func
 	ALIGN
 GLOBAL(\label)
-	leaq \func(%rip),%rax
-	jmp  ia32_ptregs_common	
+	leaq	\func(%rip), %rax
+	jmp	ia32_ptregs_common
 	.endm
 
-	CFI_STARTPROC32
-
-	PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
-	PTREGSCALL stub32_sigreturn, sys32_sigreturn
-	PTREGSCALL stub32_fork, sys_fork
-	PTREGSCALL stub32_vfork, sys_vfork
+	PTREGSCALL stub32_rt_sigreturn,	sys32_rt_sigreturn
+	PTREGSCALL stub32_sigreturn,	sys32_sigreturn
+	PTREGSCALL stub32_fork,		sys_fork
+	PTREGSCALL stub32_vfork,	sys_vfork
 
 	ALIGN
 GLOBAL(stub32_clone)
-	leaq sys_clone(%rip),%rax
-	mov	%r8, %rcx
-	jmp  ia32_ptregs_common
+	leaq	sys_clone(%rip), %rax
+	/*
+	 * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr).
+	 * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val).
+	 *
+	 * The native 64-bit kernel's sys_clone() implements the latter,
+	 * so we need to swap arguments here before calling it:
+	 */
+	xchg	%r8, %rcx
+	jmp	ia32_ptregs_common
 
 	ALIGN
 ia32_ptregs_common:
-	CFI_ENDPROC
-	CFI_STARTPROC32	simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,SIZEOF_PTREGS
-	CFI_REL_OFFSET	rax,RAX
-	CFI_REL_OFFSET	rcx,RCX
-	CFI_REL_OFFSET	rdx,RDX
-	CFI_REL_OFFSET	rsi,RSI
-	CFI_REL_OFFSET	rdi,RDI
-	CFI_REL_OFFSET	rip,RIP
-/*	CFI_REL_OFFSET	cs,CS*/
-/*	CFI_REL_OFFSET	rflags,EFLAGS*/
-	CFI_REL_OFFSET	rsp,RSP
-/*	CFI_REL_OFFSET	ss,SS*/
 	SAVE_EXTRA_REGS 8
-	call *%rax
+	call	*%rax
 	RESTORE_EXTRA_REGS 8
 	ret
-	CFI_ENDPROC
 END(ia32_ptregs_common)
--- a/arch/x86/entry/syscall_32-xen.c
+++ b/arch/x86/entry/syscall_32-xen.c
@@ -9,12 +9,12 @@ extern asmlinkage void cstar_set_tif(voi
 #define	ptregs_clone cstar_set_tif
 #define	ptregs_vfork cstar_set_tif
 
-__visible const sys_call_ptr_t cstar_call_table[__NR_ia32_syscall_max+1] = {
+__visible const sys_call_ptr_t cstar_call_table[__NR_syscall_compat_max+1] = {
 	/*
 	 * Smells like a compiler bug -- it doesn't work
 	 * when the & below is removed.
 	 */
-	[0 ... __NR_ia32_syscall_max] = &sys_ni_syscall,
+	[0 ... __NR_syscall_compat_max] = &sys_ni_syscall,
 #include <asm/syscalls_32.h>
 };
 #endif /* TIF_CSTAR */
--- a/arch/x86/entry/vdso/vdso32-setup-xen.c
+++ b/arch/x86/entry/vdso/vdso32-setup-xen.c
@@ -75,11 +75,11 @@ int __init sysenter_setup(void)
 #ifdef CONFIG_X86_32
 	if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
 # ifdef TIF_CSTAR
-		extern asmlinkage void ia32pv_cstar_target(void);
+		extern asmlinkage void entry_SYSCALL_PV32(void);
 		static const struct callback_register __initconst cstar = {
 			.type = CALLBACKTYPE_syscall32,
 			.address = { __KERNEL_CS,
-			             (unsigned long)ia32pv_cstar_target },
+			             (unsigned long)entry_SYSCALL_PV32 },
 		};
 		if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD
 		    && HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) == 0)
--- a/arch/x86/include/asm/irqdomain.h
+++ b/arch/x86/include/asm/irqdomain.h
@@ -10,7 +10,11 @@ enum {
 	X86_IRQ_ALLOC_CONTIGUOUS_VECTORS		= 0x1,
 };
 
+#ifndef CONFIG_XEN
 extern struct irq_domain *x86_vector_domain;
+#else
+#define x86_vector_domain xen_irq_domain
+#endif
 
 extern void init_irq_alloc_info(struct irq_alloc_info *info,
 				const struct cpumask *mask);
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -191,7 +191,11 @@ static inline unsigned long current_stac
 #else /* !__ASSEMBLY__ */
 
 #ifdef CONFIG_X86_64
+#ifndef CONFIG_XEN
 # define cpu_current_top_of_stack (cpu_tss + TSS_sp0)
+#else
+# define cpu_current_top_of_stack cpu_sp0
+#endif
 #endif
 
 /* Load thread_info address into "reg" */
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -25,14 +25,8 @@
 #ifndef _ASM_X86_TOPOLOGY_H
 #define _ASM_X86_TOPOLOGY_H
 
-#ifdef CONFIG_X86_32
-# ifdef CONFIG_SMP
-#  define ENABLE_TOPO_DEFINES
-# endif
-#else
-# if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
-#  define ENABLE_TOPO_DEFINES
-# endif
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
+# define ENABLE_TOPO_DEFINES
 #endif
 
 /*
--- a/arch/x86/include/mach-xen/asm/desc.h
+++ b/arch/x86/include/mach-xen/asm/desc.h
@@ -304,21 +304,6 @@ static inline void clear_LDT(void)
 	set_ldt(NULL, 0);
 }
 
-/*
- * load one particular LDT into the current CPU
- */
-static inline void load_LDT_nolock(mm_context_t *pc)
-{
-	set_ldt(pc->ldt, pc->size);
-}
-
-static inline void load_LDT(mm_context_t *pc)
-{
-	preempt_disable();
-	load_LDT_nolock(pc);
-	preempt_enable();
-}
-
 static inline unsigned long get_desc_base(const struct desc_struct *desc)
 {
 	return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
--- a/arch/x86/include/mach-xen/asm/fpu/internal.h
+++ b/arch/x86/include/mach-xen/asm/fpu/internal.h
@@ -1,26 +1,33 @@
-#ifndef _FPU_INTERNAL_H
-#include <asm/i387.h>
+#ifndef _ASM_X86_FPU_INTERNAL_H
 #define switch_fpu_prepare native_switch_fpu_prepare
-#include_next <asm/fpu-internal.h>
+#include_next <asm/fpu/internal.h>
 #undef switch_fpu_prepare
 
-static inline bool xen_thread_fpu_begin(struct task_struct *tsk,
-					multicall_entry_t *mcl)
+static inline void xen_fpregs_activate_hw(multicall_entry_t **mcl)
 {
-	bool ret = false;
+	if (!use_eager_fpu()) {
+		(*mcl)->op = __HYPERVISOR_fpu_taskswitch;
+		(*mcl)++->args[0] = 0;
+	}
+}
 
-	if (mcl && !use_eager_fpu()) {
-		mcl->op = __HYPERVISOR_fpu_taskswitch;
-		mcl->args[0] = 0;
-		ret = true;
+static inline void xen_fpregs_deactivate_hw(multicall_entry_t **mcl)
+{
+	if (!use_eager_fpu()) {
+		(*mcl)->op = __HYPERVISOR_fpu_taskswitch;
+		(*mcl)++->args[0] = 1;
 	}
-	__thread_set_has_fpu(tsk);
+}
 
-	return ret;
+static inline void xen_fpregs_activate(struct fpu *fpu,
+				       multicall_entry_t **mcl)
+{
+	xen_fpregs_activate_hw(mcl);
+	__fpregs_activate(fpu);
 }
 
-static inline fpu_switch_t xen_switch_fpu_prepare(struct task_struct *old,
-						  struct task_struct *new,
+static inline fpu_switch_t xen_switch_fpu_prepare(struct fpu *old_fpu,
+						  struct fpu *new_fpu,
 						  int cpu,
 						  multicall_entry_t **mcl)
 {
@@ -30,38 +37,36 @@ static inline fpu_switch_t xen_switch_fp
 	 * If the task has used the math, pre-load the FPU on xsave processors
 	 * or if the past 5 consecutive context-switches used math.
 	 */
-	fpu.preload = tsk_used_math(new) &&
-		      (use_eager_fpu() || new->thread.fpu_counter > 5);
+	fpu.preload = new_fpu->fpstate_active &&
+		      (use_eager_fpu() || new_fpu->counter > 5);
 
-	if (__thread_has_fpu(old)) {
-		if (!__save_init_fpu(old))
-			task_disable_lazy_fpu_restore(old);
+	if (old_fpu->fpregs_active) {
+		if (!copy_fpregs_to_fpstate(old_fpu))
+			old_fpu->last_cpu = -1;
 		else
-			old->thread.fpu.last_cpu = cpu;
+			old_fpu->last_cpu = cpu;
 
-		/* But leave fpu_owner_task! */
-		old->thread.fpu.has_fpu = 0;
+		/* But leave fpu_fpregs_owner_ctx! */
+		old_fpu->fpregs_active = 0;
 
 		/* Don't change CR0.TS if we just switch! */
 		if (fpu.preload) {
-			new->thread.fpu_counter++;
-			__thread_set_has_fpu(new);
-			prefetch(new->thread.fpu.state);
-		} else if (!use_eager_fpu()) {
-			(*mcl)->op = __HYPERVISOR_fpu_taskswitch;
-			(*mcl)++->args[0] = 1;
+			new_fpu->counter++;
+			__fpregs_activate(new_fpu);
+			prefetch(&new_fpu->state);
+		} else {
+			xen_fpregs_deactivate_hw(mcl);
 		}
 	} else {
-		old->thread.fpu_counter = 0;
-		task_disable_lazy_fpu_restore(old);
+		old_fpu->counter = 0;
+		old_fpu->last_cpu = -1;
 		if (fpu.preload) {
-			new->thread.fpu_counter++;
-			if (fpu_lazy_restore(new, cpu))
+			new_fpu->counter++;
+			if (fpu_want_lazy_restore(new_fpu, cpu))
 				fpu.preload = 0;
 			else
-				prefetch(new->thread.fpu.state);
-			if (xen_thread_fpu_begin(new, *mcl))
-				++*mcl;
+				prefetch(&new_fpu->state);
+			xen_fpregs_activate(new_fpu, mcl);
 		}
 	}
 	return fpu;
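
The two queueing helpers above only fill in a __HYPERVISOR_fpu_taskswitch entry at the caller's multicall cursor and advance it; the context-switch path is then presumably expected to flush whatever has accumulated with one batched hypercall (HYPERVISOR_multicall()). A minimal user-space sketch of that cursor-advancing pattern, using stand-in types and opcode values rather than the real Xen ABI, might look like this:

#include <stdio.h>

/* Stand-ins for Xen's multicall_entry_t and hypercall number; illustrative
 * only, not the real ABI. */
struct mc_entry {
	unsigned int op;
	unsigned long args[2];
};

enum { OP_FPU_TASKSWITCH = 1 };

/* Fill the entry the cursor points at, then advance the cursor, the way
 * xen_fpregs_activate_hw()/xen_fpregs_deactivate_hw() do above. */
static void queue_fpu_taskswitch(struct mc_entry **mcl, unsigned long set_ts)
{
	(*mcl)->op = OP_FPU_TASKSWITCH;
	(*mcl)++->args[0] = set_ts;
}

static void flush_batch(const struct mc_entry *batch, const struct mc_entry *end)
{
	/* The kernel would issue HYPERVISOR_multicall(batch, end - batch) here;
	 * printing stands in for the hypercall in this sketch. */
	for (const struct mc_entry *e = batch; e < end; e++)
		printf("op=%u arg0=%lu\n", e->op, e->args[0]);
}

int main(void)
{
	struct mc_entry batch[4], *mcl = batch;

	queue_fpu_taskswitch(&mcl, 1);	/* outgoing task: set CR0.TS */
	queue_fpu_taskswitch(&mcl, 0);	/* incoming task: clear CR0.TS */
	flush_batch(batch, mcl);
	return 0;
}

Whether zero, one or two entries end up queued per switch depends on use_eager_fpu() and on whether the outgoing task owned the FPU, exactly as in xen_switch_fpu_prepare() above.
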
--- a/arch/x86/include/mach-xen/asm/io.h
+++ b/arch/x86/include/mach-xen/asm/io.h
@@ -35,11 +35,13 @@
   */
 
 #define ARCH_HAS_IOREMAP_WC
+#define ARCH_HAS_IOREMAP_WT
 
 #include <linux/string.h>
 #include <linux/compiler.h>
 #include <asm/page.h>
 #include <asm/early_ioremap.h>
+#include <asm/pgtable_types.h>
 #ifdef __KERNEL__
 #include <asm/fixmap.h>
 #endif
@@ -183,6 +185,7 @@ static inline void *phys_to_virt(phys_ad
  * look at pci_iomap().
  */
 extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
 extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
 				unsigned long prot_val);
@@ -203,8 +206,6 @@ extern void set_iounmap_nonlazy(void);
 
 #include <asm-generic/iomap.h>
 
-#include <linux/vmalloc.h>
-
 /*
  * Convert a virtual cached pointer to an uncached pointer
  */
@@ -243,6 +244,12 @@ static inline void flush_write_buffers(v
 #endif
 }
 
+static inline void __pmem *arch_memremap_pmem(resource_size_t offset,
+	unsigned long size)
+{
+	return (void __force __pmem *) ioremap_cache(offset, size);
+}
+
 #endif /* __KERNEL__ */
 
 extern void native_io_delay(void);
@@ -325,12 +332,16 @@ extern void unxlate_dev_mem_ptr(phys_add
 extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
 				     enum page_cache_mode pcm);
 extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size);
 
 extern bool is_early_ioremap_ptep(pte_t *ptep);
 
 #define IO_SPACE_LIMIT 0xffff
 
 #ifdef CONFIG_MTRR
+extern int __must_check arch_phys_wc_index(int handle);
+#define arch_phys_wc_index arch_phys_wc_index
+
 extern int __must_check arch_phys_wc_add(unsigned long base,
 					 unsigned long size);
 extern void arch_phys_wc_del(int handle);
--- a/arch/x86/include/mach-xen/asm/irq_vectors.h
+++ b/arch/x86/include/mach-xen/asm/irq_vectors.h
@@ -4,9 +4,6 @@
 #define MCE_VECTOR			0x12
 
 #define IA32_SYSCALL_VECTOR		0x80
-#ifdef CONFIG_X86_32
-# define SYSCALL_VECTOR			0x80
-#endif
 
 #define RESCHEDULE_VECTOR		0
 #define CALL_FUNCTION_VECTOR		1
@@ -48,7 +45,7 @@ static inline int invalid_vm86_irq(int i
  * static arrays.
  */
 
-#define NR_IRQS_LEGACY			  16
+#define NR_IRQS_LEGACY			16
 
 /*
  * The flat IRQ space is divided into two regions:
@@ -67,15 +64,17 @@ static inline int invalid_vm86_irq(int i
 #define IO_APIC_VECTOR_LIMIT		PIRQ_MAX(32 * MAX_IO_APICS)
 #define CPU_VECTOR_LIMIT		PIRQ_MAX(64 * NR_CPUS)
 
-#if defined(CONFIG_X86_IO_APIC)
-# define NR_PIRQS					\
+#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_PCI_MSI)
+#define NR_PIRQS						\
 	(CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ?	\
 		(NR_VECTORS + CPU_VECTOR_LIMIT)  :	\
 		(NR_VECTORS + IO_APIC_VECTOR_LIMIT))
-#elif defined(CONFIG_XEN_PCIDEV_FRONTEND)
-# define NR_PIRQS			(NR_VECTORS + CPU_VECTOR_LIMIT)
-#else /* !CONFIG_X86_IO_APIC: */
-# define NR_PIRQS			NR_IRQS_LEGACY
+#elif defined(CONFIG_X86_IO_APIC)
+#define NR_PIRQS				(NR_VECTORS + IO_APIC_VECTOR_LIMIT)
+#elif defined(CONFIG_PCI_MSI)
+#define NR_PIRQS				(NR_VECTORS + CPU_VECTOR_LIMIT)
+#else
+#define NR_PIRQS				NR_IRQS_LEGACY
 #endif
 
 #ifndef __ASSEMBLY__
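
To sanity-check the reworked NR_PIRQS selection above, the arithmetic can be replayed outside the kernel. The limits below are made-up example values (and PIRQ_MAX() is reduced to a pass-through), not anything taken from a real configuration:

#include <stdio.h>

#define NR_VECTORS		256
#define NR_IRQS_LEGACY		16
#define MAX_IO_APICS		128	/* example value */
#define NR_CPUS			256	/* example value */
#define PIRQ_MAX(n)		(n)	/* simplified stand-in */

#define IO_APIC_VECTOR_LIMIT	PIRQ_MAX(32 * MAX_IO_APICS)
#define CPU_VECTOR_LIMIT	PIRQ_MAX(64 * NR_CPUS)

int main(void)
{
	/* CONFIG_X86_IO_APIC && CONFIG_PCI_MSI: the larger limit wins. */
	printf("IO-APIC + MSI: %d\n", NR_VECTORS +
	       (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ?
		CPU_VECTOR_LIMIT : IO_APIC_VECTOR_LIMIT));
	/* The remaining branches of the #if chain: */
	printf("IO-APIC only:  %d\n", NR_VECTORS + IO_APIC_VECTOR_LIMIT);
	printf("MSI only:      %d\n", NR_VECTORS + CPU_VECTOR_LIMIT);
	printf("neither:       %d\n", NR_IRQS_LEGACY);
	return 0;
}
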
--- a/arch/x86/include/mach-xen/asm/mmu_context.h
+++ b/arch/x86/include/mach-xen/asm/mmu_context.h
@@ -30,7 +30,7 @@ extern struct static_key rdpmc_always_av
 
 static inline void load_mm_cr4(struct mm_struct *mm)
 {
-	if (static_key_true(&rdpmc_always_available) ||
+	if (static_key_false(&rdpmc_always_available) ||
 	    atomic_read(&mm->context.perf_rdpmc_allowed))
 		cr4_set_bits(X86_CR4_PCE);
 	else
@@ -41,6 +41,50 @@ static inline void load_mm_cr4(struct mm
 #endif
 
 /*
+ * ldt_structs can be allocated, used, and freed, but they are never
+ * modified while live.
+ */
+struct ldt_struct {
+	/*
+	 * Xen requires page-aligned LDTs with special permissions.  This is
+	 * needed to prevent us from installing evil descriptors such as
+	 * call gates.  On native, we could merge the ldt_struct and LDT
+	 * allocations, but it's not worth trying to optimize.
+	 */
+	struct desc_struct *entries;
+	int size;
+};
+
+static inline void load_mm_ldt(struct mm_struct *mm)
+{
+	struct ldt_struct *ldt;
+
+	/* lockless_dereference synchronizes with smp_store_release */
+	ldt = lockless_dereference(mm->context.ldt);
+
+	/*
+	 * Any change to mm->context.ldt is followed by an IPI to all
+	 * CPUs with the mm active.  The LDT will not be freed until
+	 * after the IPI is handled by all such CPUs.  This means that,
+	 * if the ldt_struct changes before we return, the values we see
+	 * will be safe, and the new values will be loaded before we run
+	 * any user code.
+	 *
+	 * NB: don't try to convert this to use RCU without extreme care.
+	 * We would still need IRQs off, because we don't want to change
+	 * the local LDT after an IPI loaded a newer value than the one
+	 * that we can see.
+	 */
+
+	if (unlikely(ldt))
+		set_ldt(ldt->entries, ldt->size);
+	else
+		clear_LDT();
+
+	DEBUG_LOCKS_WARN_ON(preemptible());
+}
+
+/*
  * Used for LDT copy/destruction.
  */
 int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
@@ -137,15 +181,24 @@ static inline void switch_mm(struct mm_s
 		 * was called and then modify_ldt changed
 		 * prev->context.ldt but suppressed an IPI to this CPU.
 		 * In this case, prev->context.ldt != NULL, because we
-		 * never free an LDT while the mm still exists.  That
-		 * means that next->context.ldt != prev->context.ldt,
-		 * because mms never share an LDT.
+		 * never set context.ldt to NULL while the mm still
+		 * exists.  That means that next->context.ldt !=
+		 * prev->context.ldt, because mms never share an LDT.
 		 */
 		if (unlikely(prev->context.ldt != next->context.ldt)) {
-			/* load_LDT_nolock(&next->context) */
+			/* load_mm_ldt(next) */
+			const struct ldt_struct *ldt;
+
+			/* lockless_dereference synchronizes with smp_store_release */
+			ldt = lockless_dereference(next->context.ldt);
 			op->cmd = MMUEXT_SET_LDT;
-			op->arg1.linear_addr = (unsigned long)next->context.ldt;
-			op->arg2.nr_ents     = next->context.size;
+			if (unlikely(ldt)) {
+				op->arg1.linear_addr = (long)ldt->entries;
+				op->arg2.nr_ents     = ldt->size;
+			} else {
+				op->arg1.linear_addr = 0;
+				op->arg2.nr_ents     = 0;
+			}
 			op++;
 		}
 
@@ -176,7 +229,7 @@ static inline void switch_mm(struct mm_s
 			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 			load_mm_cr4(next);
 			xen_new_user_pt(__pa(__user_pgd(next->pgd)));
-			load_LDT_nolock(&next->context);
+			load_mm_ldt(next);
 		}
 	}
 #endif
@@ -214,6 +267,19 @@ static inline void arch_exit_mmap(struct
 }
 #endif
 
+#ifdef CONFIG_X86_64
+static inline bool is_64bit_mm(struct mm_struct *mm)
+{
+	return	!config_enabled(CONFIG_IA32_EMULATION) ||
+		!(mm->context.ia32_compat == TIF_IA32);
+}
+#else
+static inline bool is_64bit_mm(struct mm_struct *mm)
+{
+	return false;
+}
+#endif
+
 static inline void arch_bprm_mm_init(struct mm_struct *mm,
 		struct vm_area_struct *vma)
 {
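
The new load_mm_ldt(), and the open-coded copy of its read side in switch_mm() above, rely on lockless_dereference() pairing with the writer's smp_store_release(): a single dependency-ordered load observes either the old table or a fully initialized new one, and the follow-up IPI keeps the old table alive until no CPU can still see it. A small user-space analogue of that publish/consume pattern, with hypothetical set_ldt()/clear_ldt() stand-ins for the real loaders, is sketched below:

#include <stdatomic.h>
#include <stddef.h>

struct ldt_struct {
	void *entries;	/* the descriptor table; never modified once published */
	int size;
};

static _Atomic(struct ldt_struct *) current_ldt;

/* Writer: build the table completely, then publish it with release semantics
 * (the kernel's smp_store_release()).  In the kernel, an IPI to every CPU
 * using the mm follows before the old table may be freed. */
static void publish_ldt(struct ldt_struct *new_ldt)
{
	atomic_store_explicit(&current_ldt, new_ldt, memory_order_release);
}

/* Reader: one dependency-ordered load (lockless_dereference()) suffices;
 * set_ldt()/clear_ldt() are hypothetical stand-ins for the real loaders. */
static void use_ldt(void (*set_ldt)(void *, int), void (*clear_ldt)(void))
{
	struct ldt_struct *ldt =
		atomic_load_explicit(&current_ldt, memory_order_consume);

	if (ldt)
		set_ldt(ldt->entries, ldt->size);
	else
		clear_ldt();
}

static void set_ldt_stub(void *entries, int size) { (void)entries; (void)size; }
static void clear_ldt_stub(void) { }

int main(void)
{
	static struct ldt_struct ldt = { .entries = NULL, .size = 0 };

	use_ldt(set_ldt_stub, clear_ldt_stub);	/* no table yet: clear_LDT() path */
	publish_ldt(&ldt);
	use_ldt(set_ldt_stub, clear_ldt_stub);	/* sees the published table */
	return 0;
}
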
--- a/arch/x86/include/mach-xen/asm/pci.h
+++ b/arch/x86/include/mach-xen/asm/pci.h
@@ -5,7 +5,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/string.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <asm/io.h>
 #include <asm/x86_init.h>
 
@@ -86,13 +86,6 @@ extern int pci_mmap_page_range(struct pc
 
 #ifdef CONFIG_PCI
 extern void early_quirks(void);
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	*strat = PCI_DMA_BURST_INFINITY;
-	*strategy_parameter = ~0UL;
-}
 #else
 static inline void early_quirks(void) { }
 #endif
@@ -102,15 +95,10 @@ extern void pci_iommu_alloc(void);
 #if defined(CONFIG_PCI_MSI) && !defined(CONFIG_XEN)
 /* implemented in arch/x86/kernel/apic/io_apic. */
 struct msi_desc;
-void native_compose_msi_msg(struct pci_dev *pdev, unsigned int irq,
-			    unsigned int dest, struct msi_msg *msg, u8 hpet_id);
 int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
 void native_teardown_msi_irq(unsigned int irq);
 void native_restore_msi_irqs(struct pci_dev *dev);
-int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
-		  unsigned int irq_base, unsigned int irq_offset);
 #else
-#define native_compose_msi_msg		NULL
 #define native_setup_msi_irqs		NULL
 #define native_teardown_msi_irq		NULL
 #endif
--- a/arch/x86/include/mach-xen/asm/pgtable.h
+++ b/arch/x86/include/mach-xen/asm/pgtable.h
@@ -398,11 +398,17 @@ static inline int is_new_memtype_allowed
 	 * requested memtype:
 	 * - request is uncached, return cannot be write-back
 	 * - request is write-combine, return cannot be write-back
+	 * - request is write-through, return cannot be write-back
+	 * - request is write-through, return cannot be write-combine
 	 */
 	if ((pcm == _PAGE_CACHE_MODE_UC_MINUS &&
 	     new_pcm == _PAGE_CACHE_MODE_WB) ||
 	    (pcm == _PAGE_CACHE_MODE_WC &&
-	     new_pcm == _PAGE_CACHE_MODE_WB)) {
+	     new_pcm == _PAGE_CACHE_MODE_WB) ||
+	    (pcm == _PAGE_CACHE_MODE_WT &&
+	     new_pcm == _PAGE_CACHE_MODE_WB) ||
+	    (pcm == _PAGE_CACHE_MODE_WT &&
+	     new_pcm == _PAGE_CACHE_MODE_WC)) {
 		return 0;
 	}
 
@@ -840,9 +846,9 @@ static inline int pmd_write(pmd_t pmd)
 	return pmd_flags(pmd) & _PAGE_RW;
 }
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
 				       pmd_t *pmdp)
 {
 	pmd_t pmd = xen_pmdp_get_and_clear(pmdp);
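
The extra write-through cases added to is_new_memtype_allowed() above follow roughly the same rule as the existing ones: the mapping actually returned must not be weaker (more cacheable, or write-combining) than the type requested. Restated as a small self-contained check, with simplified names standing in for the kernel's _PAGE_CACHE_MODE_* values:

#include <stdbool.h>
#include <stdio.h>

enum pcm { PCM_WB, PCM_WC, PCM_WT, PCM_UC_MINUS };	/* simplified names */

/* Mirrors the checks in the hunk above: 'req' is the requested cache mode,
 * 'got' is the mode the established mapping would hand back. */
static bool new_memtype_allowed(enum pcm req, enum pcm got)
{
	if ((req == PCM_UC_MINUS && got == PCM_WB) ||
	    (req == PCM_WC && got == PCM_WB) ||
	    (req == PCM_WT && got == PCM_WB) ||
	    (req == PCM_WT && got == PCM_WC))
		return false;
	return true;
}

int main(void)
{
	printf("WT over WB: %d\n", new_memtype_allowed(PCM_WT, PCM_WB));	/* 0 */
	printf("WT over WC: %d\n", new_memtype_allowed(PCM_WT, PCM_WC));	/* 0 */
	printf("WT over WT: %d\n", new_memtype_allowed(PCM_WT, PCM_WT));	/* 1 */
	return 0;
}
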
--- a/arch/x86/include/mach-xen/asm/pgtable_types.h
+++ b/arch/x86/include/mach-xen/asm/pgtable_types.h
@@ -419,6 +419,9 @@ extern int nx_enabled;
 #define pgprot_writecombine	pgprot_writecombine
 extern pgprot_t pgprot_writecombine(pgprot_t prot);
 
+#define pgprot_writethrough	pgprot_writethrough
+extern pgprot_t pgprot_writethrough(pgprot_t prot);
+
 #ifndef CONFIG_XEN
 /* Indicate that x86 has its own track and untrack pfn vma functions */
 #define __HAVE_PFNMAP_TRACKING
--- a/arch/x86/include/mach-xen/asm/processor.h
+++ b/arch/x86/include/mach-xen/asm/processor.h
@@ -21,6 +21,7 @@ struct mm_struct;
 #include <asm/desc_defs.h>
 #include <asm/nops.h>
 #include <asm/special_insns.h>
+#include <asm/fpu/types.h>
 
 #include <linux/personality.h>
 #include <linux/cpumask.h>
@@ -54,11 +55,16 @@ static inline void *current_text_addr(vo
 	return pc;
 }
 
+/*
+ * These alignment constraints are for performance in the vSMP case,
+ * but in the task_struct case we must also meet hardware imposed
+ * alignment requirements of the FPU state:
+ */
 #ifdef CONFIG_X86_VSMP
 # define ARCH_MIN_TASKALIGN		(1 << INTERNODE_CACHE_SHIFT)
 # define ARCH_MIN_MMSTRUCT_ALIGN	(1 << INTERNODE_CACHE_SHIFT)
 #else
-# define ARCH_MIN_TASKALIGN		16
+# define ARCH_MIN_TASKALIGN		__alignof__(union fpregs_state)
 # define ARCH_MIN_MMSTRUCT_ALIGN	0
 #endif
 
@@ -177,7 +183,6 @@ extern const struct seq_operations cpuin
 #define cache_line_size()	(boot_cpu_data.x86_cache_alignment)
 
 extern void cpu_detect(struct cpuinfo_x86 *c);
-extern void fpu_detect(struct cpuinfo_x86 *c);
 
 extern void early_cpu_init(void);
 extern void identify_boot_cpu(void);
@@ -328,131 +333,9 @@ struct orig_ist {
 #ifdef CONFIG_X86_32
 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
 #else
-DECLARE_PER_CPU(unsigned long, kernel_stack);
+DECLARE_PER_CPU(unsigned long, cpu_sp0);
 #endif
 
-#define	MXCSR_DEFAULT		0x1f80
-
-struct i387_fsave_struct {
-	u32			cwd;	/* FPU Control Word		*/
-	u32			swd;	/* FPU Status Word		*/
-	u32			twd;	/* FPU Tag Word			*/
-	u32			fip;	/* FPU IP Offset		*/
-	u32			fcs;	/* FPU IP Selector		*/
-	u32			foo;	/* FPU Operand Pointer Offset	*/
-	u32			fos;	/* FPU Operand Pointer Selector	*/
-
-	/* 8*10 bytes for each FP-reg = 80 bytes:			*/
-	u32			st_space[20];
-
-	/* Software status information [not touched by FSAVE ]:		*/
-	u32			status;
-};
-
-struct i387_fxsave_struct {
-	u16			cwd; /* Control Word			*/
-	u16			swd; /* Status Word			*/
-	u16			twd; /* Tag Word			*/
-	u16			fop; /* Last Instruction Opcode		*/
-	union {
-		struct {
-			u64	rip; /* Instruction Pointer		*/
-			u64	rdp; /* Data Pointer			*/
-		};
-		struct {
-			u32	fip; /* FPU IP Offset			*/
-			u32	fcs; /* FPU IP Selector			*/
-			u32	foo; /* FPU Operand Offset		*/
-			u32	fos; /* FPU Operand Selector		*/
-		};
-	};
-	u32			mxcsr;		/* MXCSR Register State */
-	u32			mxcsr_mask;	/* MXCSR Mask		*/
-
-	/* 8*16 bytes for each FP-reg = 128 bytes:			*/
-	u32			st_space[32];
-
-	/* 16*16 bytes for each XMM-reg = 256 bytes:			*/
-	u32			xmm_space[64];
-
-	u32			padding[12];
-
-	union {
-		u32		padding1[12];
-		u32		sw_reserved[12];
-	};
-
-} __attribute__((aligned(16)));
-
-struct i387_soft_struct {
-	u32			cwd;
-	u32			swd;
-	u32			twd;
-	u32			fip;
-	u32			fcs;
-	u32			foo;
-	u32			fos;
-	/* 8*10 bytes for each FP-reg = 80 bytes: */
-	u32			st_space[20];
-	u8			ftop;
-	u8			changed;
-	u8			lookahead;
-	u8			no_update;
-	u8			rm;
-	u8			alimit;
-	struct math_emu_info	*info;
-	u32			entry_eip;
-};
-
-struct ymmh_struct {
-	/* 16 * 16 bytes for each YMMH-reg = 256 bytes */
-	u32 ymmh_space[64];
-};
-
-/* We don't support LWP yet: */
-struct lwp_struct {
-	u8 reserved[128];
-};
-
-struct bndreg {
-	u64 lower_bound;
-	u64 upper_bound;
-} __packed;
-
-struct bndcsr {
-	u64 bndcfgu;
-	u64 bndstatus;
-} __packed;
-
-struct xsave_hdr_struct {
-	u64 xstate_bv;
-	u64 xcomp_bv;
-	u64 reserved[6];
-} __attribute__((packed));
-
-struct xsave_struct {
-	struct i387_fxsave_struct i387;
-	struct xsave_hdr_struct xsave_hdr;
-	struct ymmh_struct ymmh;
-	struct lwp_struct lwp;
-	struct bndreg bndreg[4];
-	struct bndcsr bndcsr;
-	/* new processor state extensions will go here */
-} __attribute__ ((packed, aligned (64)));
-
-union thread_xstate {
-	struct i387_fsave_struct	fsave;
-	struct i387_fxsave_struct	fxsave;
-	struct i387_soft_struct		soft;
-	struct xsave_struct		xsave;
-};
-
-struct fpu {
-	unsigned int last_cpu;
-	unsigned int has_fpu;
-	union thread_xstate *state;
-};
-
 #ifdef CONFIG_X86_64
 #ifndef CONFIG_X86_NO_TSS
 DECLARE_PER_CPU(struct orig_ist, orig_ist);
@@ -503,8 +386,6 @@ DECLARE_PER_CPU(struct irq_stack *, soft
 #endif	/* X86_64 */
 
 extern unsigned int xstate_size;
-extern void free_thread_xstate(struct task_struct *);
-extern struct kmem_cache *task_xstate_cachep;
 
 struct perf_event;
 
@@ -528,6 +409,7 @@ struct thread_struct {
 	unsigned long		fs;
 #endif
 	unsigned long		gs;
+
 	/* Save middle states of ptrace breakpoints */
 	struct perf_event	*ptrace_bps[HBP_NUM];
 	/* Debug status used for traps, single steps, etc... */
@@ -538,8 +420,6 @@ struct thread_struct {
 	unsigned long		cr2;
 	unsigned long		trap_nr;
 	unsigned long		error_code;
-	/* floating point and extended processor state */
-	struct fpu		fpu;
 #ifdef CONFIG_X86_32
 	/* Virtual 86 mode info */
 	struct vm86_struct __user *vm86_info;
@@ -555,15 +435,13 @@ struct thread_struct {
 	unsigned long		iopl;
 	/* Max allowed port in the bitmap, in bytes: */
 	unsigned		io_bitmap_max;
+
+	/* Floating point and extended processor state */
+	struct fpu		fpu;
 	/*
-	 * fpu_counter contains the number of consecutive context switches
-	 * that the FPU is used. If this is over a threshold, the lazy fpu
-	 * saving becomes unlazy to save the trap. This is an unsigned char
-	 * so that after 256 times the counter wraps and the behavior turns
-	 * lazy again; this to deal with bursty apps that only use FPU for
-	 * a short time
+	 * WARNING: 'fpu' is dynamically-sized.  It *MUST* be at
+	 * the end.
 	 */
-	unsigned char fpu_counter;
 };
 
 /*
@@ -601,7 +479,7 @@ native_load_sp0(struct tss_struct *tss,
 static inline unsigned long current_top_of_stack(void)
 {
 #ifdef CONFIG_X86_64
-	return this_cpu_read_stable(kernel_stack);
+	return this_cpu_read_stable(cpu_sp0);
 #else
 	/* sp0 on x86_32 is special in and around vm86 mode. */
 	return this_cpu_read_stable(cpu_current_top_of_stack);
@@ -931,24 +809,25 @@ extern int get_tsc_mode(unsigned long ad
 extern int set_tsc_mode(unsigned int val);
 
 /* Register/unregister a process' MPX related resource */
-#define MPX_ENABLE_MANAGEMENT(tsk)	mpx_enable_management((tsk))
-#define MPX_DISABLE_MANAGEMENT(tsk)	mpx_disable_management((tsk))
+#define MPX_ENABLE_MANAGEMENT()	mpx_enable_management()
+#define MPX_DISABLE_MANAGEMENT()	mpx_disable_management()
 
 #ifdef CONFIG_X86_INTEL_MPX
-extern int mpx_enable_management(struct task_struct *tsk);
-extern int mpx_disable_management(struct task_struct *tsk);
+extern int mpx_enable_management(void);
+extern int mpx_disable_management(void);
 #else
-static inline int mpx_enable_management(struct task_struct *tsk)
+static inline int mpx_enable_management(void)
 {
 	return -EINVAL;
 }
-static inline int mpx_disable_management(struct task_struct *tsk)
+static inline int mpx_disable_management(void)
 {
 	return -EINVAL;
 }
 #endif /* CONFIG_X86_INTEL_MPX */
 
 extern u16 amd_get_nb_id(int cpu);
+extern u32 amd_get_nodes_per_socket(void);
 
 static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
 {
--- a/arch/x86/include/mach-xen/asm/smp.h
+++ b/arch/x86/include/mach-xen/asm/smp.h
@@ -36,19 +36,7 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_
 DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
 DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
 DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
-#endif
 
-static inline const struct cpumask *cpu_sibling_mask(int cpu)
-{
-	return cpumask_of(cpu);
-}
-
-static inline const struct cpumask *cpu_core_mask(int cpu)
-{
-	return cpumask_of(cpu);
-}
-
-#ifndef CONFIG_XEN
 static inline struct cpumask *cpu_llc_shared_mask(int cpu)
 {
 	return per_cpu(cpu_llc_shared_map, cpu);
--- a/arch/x86/include/mach-xen/asm/special_insns.h
+++ b/arch/x86/include/mach-xen/asm/special_insns.h
@@ -262,6 +262,44 @@ static inline void clwb(volatile void *_
 		: [pax] "a" (p));
 }
 
+/**
+ * pcommit_sfence() - persistent commit and fence
+ *
+ * The PCOMMIT instruction ensures that data that has been flushed from the
+ * processor's cache hierarchy with CLWB, CLFLUSHOPT or CLFLUSH is accepted to
+ * memory and is durable on the DIMM.  The primary use case for this is
+ * persistent memory.
+ *
+ * This function shows how to properly use CLWB/CLFLUSHOPT/CLFLUSH and PCOMMIT
+ * with appropriate fencing.
+ *
+ * Example:
+ * void flush_and_commit_buffer(void *vaddr, unsigned int size)
+ * {
+ *         unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1;
+ *         void *vend = vaddr + size;
+ *         void *p;
+ *
+ *         for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
+ *              p < vend; p += boot_cpu_data.x86_clflush_size)
+ *                 clwb(p);
+ *
+ *         // SFENCE to order CLWB/CLFLUSHOPT/CLFLUSH cache flushes
+ *         // MFENCE via mb() also works
+ *         wmb();
+ *
+ *         // PCOMMIT and the required SFENCE for ordering
+ *         pcommit_sfence();
+ * }
+ *
+ * After this function completes the data pointed to by 'vaddr' has been
+ * accepted to memory and will be durable if the 'vaddr' points to persistent
+ * memory.
+ *
+ * PCOMMIT must always be ordered by an MFENCE or SFENCE, so to help simplify
+ * things we include both the PCOMMIT and the required SFENCE in the
+ * alternatives generated by pcommit_sfence().
+ */
 static inline void pcommit_sfence(void)
 {
 	alternative(ASM_NOP7,
--- a/arch/x86/include/mach-xen/asm/spinlock.h
+++ b/arch/x86/include/mach-xen/asm/spinlock.h
@@ -37,6 +37,10 @@
 # define UNLOCK_LOCK_PREFIX
 #endif
 
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm/qspinlock.h>
+#else
+
 #ifdef TICKET_SHIFT
 
 /* How long a lock should spin before we consider blocking */
@@ -44,8 +48,6 @@
 
 #include <asm/irqflags.h>
 
-int xen_spinlock_init(unsigned int cpu);
-void xen_spinlock_cleanup(unsigned int cpu);
 #if CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING
 struct __raw_tickets xen_spin_adjust(const arch_spinlock_t *,
 				     struct __raw_tickets);
@@ -191,9 +193,6 @@ static inline void __ticket_spin_unlock_
 
 #else /* TICKET_SHIFT */
 
-static inline int xen_spinlock_init(unsigned int cpu) { return 0; }
-static inline void xen_spinlock_cleanup(unsigned int cpu) {}
-
 static __always_inline int __byte_spin_value_unlocked(arch_spinlock_t lock)
 {
 	return lock.lock == 0;
@@ -297,6 +296,15 @@ static __always_inline void arch_spin_un
 }
 
 #undef __arch_spin
+#endif /* CONFIG_QUEUED_SPINLOCKS */
+
+#if !defined(CONFIG_QUEUED_SPINLOCKS) && defined(TICKET_SHIFT)
+int xen_spinlock_init(unsigned int cpu);
+void xen_spinlock_cleanup(unsigned int cpu);
+#else
+static inline int xen_spinlock_init(unsigned int cpu) { return 0; }
+static inline void xen_spinlock_cleanup(unsigned int cpu) {}
+#endif
 
 /*
  * Read-write spinlocks, allowing multiple readers
--- a/arch/x86/include/mach-xen/asm/spinlock_types.h
+++ b/arch/x86/include/mach-xen/asm/spinlock_types.h
@@ -3,6 +3,9 @@
 
 #include <linux/types.h>
 
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm-generic/qspinlock_types.h>
+#else
 #ifdef CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING
 #define __TICKET_LOCK_INC	1
 #define TICKET_SLOWPATH_FLAG	((__ticket_t)0)
@@ -49,6 +52,7 @@ typedef struct {
 } arch_spinlock_t;
 
 #define __ARCH_SPIN_LOCK_UNLOCKED	{ 0 }
+#endif /* CONFIG_QUEUED_SPINLOCKS */
 
 #include <asm-generic/qrwlock_types.h>
 
--- a/arch/x86/kernel/apic/io_apic-xen.c
+++ b/arch/x86/kernel/apic/io_apic-xen.c
@@ -18,6 +18,16 @@
  *					and Rolf G. Tews
  *					for testing these extensively
  *	Paul Diefenbaugh	:	Added full ACPI support
+ *
+ * Historical information which is worth preserving:
+ *
+ * - SiS APIC rmw bug:
+ *
+ *	We used to have a workaround for a bug in SiS chips which
+ *	required rewriting the index register for a read-modify-write
+ *	operation as the chip lost the index information which was
+ *	set up for the read already. We cache the data now, so that
+ *	workaround has been removed.
  */
 
 #include <linux/mm.h>
@@ -31,13 +41,13 @@
 #include <linux/acpi.h>
 #include <linux/module.h>
 #include <linux/syscore_ops.h>
-#include <linux/irqdomain.h>
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/jiffies.h>	/* time_after() */
 #include <linux/slab.h>
 #include <linux/bootmem.h>
 
+#include <asm/irqdomain.h>
 #include <asm/idle.h>
 #include <asm/io.h>
 #include <asm/smp.h>
@@ -84,27 +94,33 @@ unsigned long io_apic_irqs;
 #define	for_each_ioapic_pin(idx, pin)	\
 	for_each_ioapic((idx))		\
 		for_each_pin((idx), (pin))
-
 #define for_each_irq_pin(entry, head) \
 	list_for_each_entry(entry, &head, list)
 
-/*
- *      Is the SiS APIC rmw bug present ?
- *      -1 = don't know, 0 = no, 1 = yes
- */
-int sis_apic_bug = -1;
-
 static DEFINE_RAW_SPINLOCK(ioapic_lock);
 static DEFINE_MUTEX(ioapic_mutex);
 static unsigned int ioapic_dynirq_base;
 static int ioapic_initialized;
 
-struct mp_pin_info {
+struct irq_pin_list {
+	struct list_head list;
+	int apic, pin;
+};
+
+struct mp_chip_data {
+#ifndef CONFIG_XEN
+	struct list_head irq_2_pin;
+#endif
+	struct IO_APIC_route_entry entry;
 	int trigger;
 	int polarity;
-	int node;
-	int set;
 	u32 count;
+	bool isa_irq;
+};
+
+struct mp_ioapic_gsi {
+	u32 gsi_base;
+	u32 gsi_end;
 };
 
 static struct ioapic {
@@ -124,7 +140,6 @@ static struct ioapic {
 	struct mp_ioapic_gsi  gsi_config;
 	struct ioapic_domain_cfg irqdomain_cfg;
 	struct irq_domain *irqdomain;
-	struct mp_pin_info *pin_info;
 	struct resource *iomem_res;
 } ioapics[MAX_IO_APICS];
 
@@ -140,7 +155,7 @@ unsigned int mpc_ioapic_addr(int ioapic_
 	return ioapics[ioapic_idx].mp_config.apicaddr;
 }
 
-struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx)
+static inline struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx)
 {
 	return &ioapics[ioapic_idx].gsi_config;
 }
@@ -152,11 +167,16 @@ static inline int mp_ioapic_pin_count(in
 	return gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1;
 }
 
-u32 mp_pin_to_gsi(int ioapic, int pin)
+static inline u32 mp_pin_to_gsi(int ioapic, int pin)
 {
 	return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin;
 }
 
+static inline bool mp_is_legacy_irq(int irq)
+{
+	return irq >= 0 && irq < nr_legacy_irqs();
+}
+
 /*
  * Initialize all legacy IRQs and all pins on the first IOAPIC
  * if we have legacy interrupt controller. Kernel boot option "pirq="
@@ -167,12 +187,7 @@ static inline int mp_init_irq_at_boot(in
 	if (!nr_legacy_irqs())
 		return 0;
 
-	return ioapic == 0 || (irq >= 0 && irq < nr_legacy_irqs());
-}
-
-static inline struct mp_pin_info *mp_pin_info(int ioapic_idx, int pin)
-{
-	return ioapics[ioapic_idx].pin_info + pin;
+	return ioapic == 0 || mp_is_legacy_irq(irq);
 }
 
 static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic)
@@ -240,16 +255,6 @@ void mp_save_irq(struct mpc_intsrc *m)
 }
 
 #ifndef CONFIG_XEN
-struct irq_pin_list {
-	struct list_head list;
-	int apic, pin;
-};
-
-static struct irq_pin_list *alloc_irq_pin_list(int node)
-{
-	return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node);
-}
-
 static void alloc_ioapic_saved_registers(int idx)
 {
 	size_t size;
@@ -271,8 +276,7 @@ static void free_ioapic_saved_registers(
 
 int __init arch_early_ioapic_init(void)
 {
-	struct irq_cfg *cfg;
-	int i, node = cpu_to_node(0);
+	int i;
 
 	if (!nr_legacy_irqs())
 		io_apic_irqs = ~0UL;
@@ -280,16 +284,6 @@ int __init arch_early_ioapic_init(void)
 	for_each_ioapic(i)
 		alloc_ioapic_saved_registers(i);
 
-	/*
-	 * For legacy IRQ's, start with assigning irq0 to irq15 to
-	 * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's.
-	 */
-	for (i = 0; i < nr_legacy_irqs(); i++) {
-		cfg = alloc_irq_and_cfg_at(i, node);
-		cfg->vector = IRQ0_VECTOR + i;
-		cpumask_setall(cfg->domain);
-	}
-
 	return 0;
 }
 
@@ -307,7 +301,7 @@ static __attribute_const__ struct io_api
 		+ (mpc_ioapic_addr(idx) & ~PAGE_MASK);
 }
 
-void io_apic_eoi(unsigned int apic, unsigned int vector)
+static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
 {
 	struct io_apic __iomem *io_apic = io_apic_base(apic);
 	writel(vector, &io_apic->eoi);
@@ -320,28 +314,14 @@ unsigned int native_io_apic_read(unsigne
 	return readl(&io_apic->data);
 }
 
-void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+static void io_apic_write(unsigned int apic, unsigned int reg,
+			  unsigned int value)
 {
 	struct io_apic __iomem *io_apic = io_apic_base(apic);
 
 	writel(reg, &io_apic->index);
 	writel(value, &io_apic->data);
 }
-
-/*
- * Re-write a value: to be used for read-modify-write
- * cycles where the read already set up the index register.
- *
- * Older SiS APIC requires we rewrite the index register
- */
-void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
-{
-	struct io_apic __iomem *io_apic = io_apic_base(apic);
-
-	if (sis_apic_bug)
-		writel(reg, &io_apic->index);
-	writel(value, &io_apic->data);
-}
 #else /* !CONFIG_XEN */
 #define alloc_ioapic_saved_registers(idx)
 #define free_ioapic_saved_registers(idx)
@@ -368,8 +348,6 @@ static inline void io_apic_write(unsigne
 	apic_op.value = value;
 	WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
 }
-
-#define io_apic_modify io_apic_write
 #endif /* !CONFIG_XEN */
 
 union entry_union {
@@ -429,7 +407,6 @@ static void ioapic_write_entry(int apic,
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-#ifndef CONFIG_XEN
 /*
  * When we mask an IO APIC routing entry, we need to write the low
  * word first, in order to set the mask bit before we change the
@@ -438,29 +415,33 @@ static void ioapic_write_entry(int apic,
 static void ioapic_mask_entry(int apic, int pin)
 {
 	unsigned long flags;
-	union entry_union eu = { .entry.mask = 1 };
+	union entry_union eu = { .entry.mask = IOAPIC_MASKED };
 
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
 	io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+#ifndef CONFIG_XEN
 	io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+#endif
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
+#ifndef CONFIG_XEN
 /*
  * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
  * shared ISA-space IRQs, so we have to support them. We are super
  * fast in the common case, and fast for shared ISA-space IRQs.
  */
-static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+static int __add_pin_to_irq_node(struct mp_chip_data *data,
+				 int node, int apic, int pin)
 {
 	struct irq_pin_list *entry;
 
 	/* don't allow duplicates */
-	for_each_irq_pin(entry, cfg->irq_2_pin)
+	for_each_irq_pin(entry, data->irq_2_pin)
 		if (entry->apic == apic && entry->pin == pin)
 			return 0;
 
-	entry = alloc_irq_pin_list(node);
+	entry = kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node);
 	if (!entry) {
 		pr_err("can not alloc irq_pin_list (%d,%d,%d)\n",
 		       node, apic, pin);
@@ -468,16 +449,16 @@ static int __add_pin_to_irq_node(struct
 	}
 	entry->apic = apic;
 	entry->pin = pin;
+	list_add_tail(&entry->list, &data->irq_2_pin);
 
-	list_add_tail(&entry->list, &cfg->irq_2_pin);
 	return 0;
 }
 
-static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin)
+static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin)
 {
 	struct irq_pin_list *tmp, *entry;
 
-	list_for_each_entry_safe(entry, tmp, &cfg->irq_2_pin, list)
+	list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list)
 		if (entry->apic == apic && entry->pin == pin) {
 			list_del(&entry->list);
 			kfree(entry);
@@ -485,22 +466,23 @@ static void __remove_pin_from_irq(struct
 		}
 }
 
-static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+static void add_pin_to_irq_node(struct mp_chip_data *data,
+				int node, int apic, int pin)
 {
-	if (__add_pin_to_irq_node(cfg, node, apic, pin))
+	if (__add_pin_to_irq_node(data, node, apic, pin))
 		panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
 }
 
 /*
  * Reroute an IRQ to a different pin.
  */
-static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
+static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node,
 					   int oldapic, int oldpin,
 					   int newapic, int newpin)
 {
 	struct irq_pin_list *entry;
 
-	for_each_irq_pin(entry, cfg->irq_2_pin) {
+	for_each_irq_pin(entry, data->irq_2_pin) {
 		if (entry->apic == oldapic && entry->pin == oldpin) {
 			entry->apic = newapic;
 			entry->pin = newpin;
@@ -510,32 +492,26 @@ static void __init replace_pin_at_irq_no
 	}
 
 	/* old apic/pin didn't exist, so just add new ones */
-	add_pin_to_irq_node(cfg, node, newapic, newpin);
+	add_pin_to_irq_node(data, node, newapic, newpin);
 }
 
-static void __io_apic_modify_irq(struct irq_pin_list *entry,
-				 int mask_and, int mask_or,
-				 void (*final)(struct irq_pin_list *entry))
-{
-	unsigned int reg, pin;
-
-	pin = entry->pin;
-	reg = io_apic_read(entry->apic, 0x10 + pin * 2);
-	reg &= mask_and;
-	reg |= mask_or;
-	io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
-	if (final)
-		final(entry);
-}
-
-static void io_apic_modify_irq(struct irq_cfg *cfg,
+static void io_apic_modify_irq(struct mp_chip_data *data,
 			       int mask_and, int mask_or,
 			       void (*final)(struct irq_pin_list *entry))
 {
+	union entry_union eu;
 	struct irq_pin_list *entry;
 
-	for_each_irq_pin(entry, cfg->irq_2_pin)
-		__io_apic_modify_irq(entry, mask_and, mask_or, final);
+	eu.entry = data->entry;
+	eu.w1 &= mask_and;
+	eu.w1 |= mask_or;
+	data->entry = eu.entry;
+
+	for_each_irq_pin(entry, data->irq_2_pin) {
+		io_apic_write(entry->apic, 0x10 + 2 * entry->pin, eu.w1);
+		if (final)
+			final(entry);
+	}
 }
 
 static void io_apic_sync(struct irq_pin_list *entry)
@@ -550,39 +526,31 @@ static void io_apic_sync(struct irq_pin_
 	readl(&io_apic->data);
 }
 
-static void mask_ioapic(struct irq_cfg *cfg)
+static void mask_ioapic_irq(struct irq_data *irq_data)
 {
+	struct mp_chip_data *data = irq_data->chip_data;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+	io_apic_modify_irq(data, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-static void mask_ioapic_irq(struct irq_data *data)
+static void __unmask_ioapic(struct mp_chip_data *data)
 {
-	mask_ioapic(irqd_cfg(data));
+	io_apic_modify_irq(data, ~IO_APIC_REDIR_MASKED, 0, NULL);
 }
 
-static void __unmask_ioapic(struct irq_cfg *cfg)
-{
-	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
-}
-
-static void unmask_ioapic(struct irq_cfg *cfg)
+static void unmask_ioapic_irq(struct irq_data *irq_data)
 {
+	struct mp_chip_data *data = irq_data->chip_data;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	__unmask_ioapic(cfg);
+	__unmask_ioapic(data);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-static void unmask_ioapic_irq(struct irq_data *data)
-{
-	unmask_ioapic(irqd_cfg(data));
-}
-
 /*
  * IO-APIC versions below 0x20 don't support EOI register.
  * For the record, here is the information about various versions:
@@ -599,7 +567,7 @@ static void unmask_ioapic_irq(struct irq
  * Otherwise, we simulate the EOI message manually by changing the trigger
  * mode to edge and then back to level, with RTE being masked during this.
  */
-void native_eoi_ioapic_pin(int apic, int pin, int vector)
+static void __eoi_ioapic_pin(int apic, int pin, int vector)
 {
 	if (mpc_ioapic_ver(apic) >= 0x20) {
 		io_apic_eoi(apic, vector);
@@ -611,7 +579,7 @@ void native_eoi_ioapic_pin(int apic, int
 		/*
 		 * Mask the entry and change the trigger mode to edge.
 		 */
-		entry1.mask = 1;
+		entry1.mask = IOAPIC_MASKED;
 		entry1.trigger = IOAPIC_EDGE;
 
 		__ioapic_write_entry(apic, pin, entry1);
@@ -623,15 +591,14 @@ void native_eoi_ioapic_pin(int apic, int
 	}
 }
 
-void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+void eoi_ioapic_pin(int vector, struct mp_chip_data *data)
 {
-	struct irq_pin_list *entry;
 	unsigned long flags;
+	struct irq_pin_list *entry;
 
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	for_each_irq_pin(entry, cfg->irq_2_pin)
-		x86_io_apic_ops.eoi_ioapic_pin(entry->apic, entry->pin,
-					       cfg->vector);
+	for_each_irq_pin(entry, data->irq_2_pin)
+		__eoi_ioapic_pin(entry->apic, entry->pin, vector);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
@@ -648,8 +615,8 @@ static void clear_IO_APIC_pin(unsigned i
 	 * Make sure the entry is masked and re-read the contents to check
 	 * if it is a level triggered pin and if the remote-IRR is set.
 	 */
-	if (!entry.mask) {
-		entry.mask = 1;
+	if (entry.mask == IOAPIC_UNMASKED) {
+		entry.mask = IOAPIC_MASKED;
 		ioapic_write_entry(apic, pin, entry);
 		entry = ioapic_read_entry(apic, pin);
 	}
@@ -662,13 +629,12 @@ static void clear_IO_APIC_pin(unsigned i
 		 * doesn't clear the remote-IRR if the trigger mode is not
 		 * set to level.
 		 */
-		if (!entry.trigger) {
+		if (entry.trigger == IOAPIC_EDGE) {
 			entry.trigger = IOAPIC_LEVEL;
 			ioapic_write_entry(apic, pin, entry);
 		}
-
 		raw_spin_lock_irqsave(&ioapic_lock, flags);
-		x86_io_apic_ops.eoi_ioapic_pin(apic, pin, entry.vector);
+		__eoi_ioapic_pin(apic, pin, entry.vector);
 		raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
 
@@ -771,8 +737,8 @@ void mask_ioapic_entries(void)
 			struct IO_APIC_route_entry entry;
 
 			entry = ioapics[apic].saved_registers[pin];
-			if (!entry.mask) {
-				entry.mask = 1;
+			if (entry.mask == IOAPIC_UNMASKED) {
+				entry.mask = IOAPIC_MASKED;
 				ioapic_write_entry(apic, pin, entry);
 			}
 		}
@@ -877,11 +843,11 @@ static int EISA_ELCR(unsigned int irq)
 
 #endif
 
-/* ISA interrupts are always polarity zero edge triggered,
+/* ISA interrupts are always active high edge triggered,
  * when listed as conforming in the MP table. */
 
-#define default_ISA_trigger(idx)	(0)
-#define default_ISA_polarity(idx)	(0)
+#define default_ISA_trigger(idx)	(IOAPIC_EDGE)
+#define default_ISA_polarity(idx)	(IOAPIC_POL_HIGH)
 
 /* EISA interrupts are always polarity zero and can be edge or level
  * trigger depending on the ELCR value.  If an interrupt is listed as
@@ -891,53 +857,55 @@ static int EISA_ELCR(unsigned int irq)
 #define default_EISA_trigger(idx)	(EISA_ELCR(mp_irqs[idx].srcbusirq))
 #define default_EISA_polarity(idx)	default_ISA_polarity(idx)
 
-/* PCI interrupts are always polarity one level triggered,
+/* PCI interrupts are always active low level triggered,
  * when listed as conforming in the MP table. */
 
-#define default_PCI_trigger(idx)	(1)
-#define default_PCI_polarity(idx)	(1)
+#define default_PCI_trigger(idx)	(IOAPIC_LEVEL)
+#define default_PCI_polarity(idx)	(IOAPIC_POL_LOW)
 
 static int irq_polarity(int idx)
 {
 	int bus = mp_irqs[idx].srcbus;
-	int polarity;
 
 	/*
 	 * Determine IRQ line polarity (high active or low active):
 	 */
-	switch (mp_irqs[idx].irqflag & 3)
-	{
-		case 0: /* conforms, ie. bus-type dependent polarity */
-			if (test_bit(bus, mp_bus_not_pci))
-				polarity = default_ISA_polarity(idx);
-			else
-				polarity = default_PCI_polarity(idx);
-			break;
-		case 1: /* high active */
-		{
-			polarity = 0;
-			break;
-		}
-		case 2: /* reserved */
-		{
-			pr_warn("broken BIOS!!\n");
-			polarity = 1;
-			break;
-		}
-		case 3: /* low active */
-		{
-			polarity = 1;
-			break;
-		}
-		default: /* invalid */
-		{
-			pr_warn("broken BIOS!!\n");
-			polarity = 1;
-			break;
-		}
+	switch (mp_irqs[idx].irqflag & 0x03) {
+	case 0:
+		/* conforms to spec, ie. bus-type dependent polarity */
+		if (test_bit(bus, mp_bus_not_pci))
+			return default_ISA_polarity(idx);
+		else
+			return default_PCI_polarity(idx);
+	case 1:
+		return IOAPIC_POL_HIGH;
+	case 2:
+		pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n");
+	case 3:
+	default: /* Pointless default required due to gcc stupidity */
+		return IOAPIC_POL_LOW;
+	}
+}
+
+#ifdef CONFIG_EISA
+static int eisa_irq_trigger(int idx, int bus, int trigger)
+{
+	switch (mp_bus_id_to_type[bus]) {
+	case MP_BUS_PCI:
+	case MP_BUS_ISA:
+		return trigger;
+	case MP_BUS_EISA:
+		return default_EISA_trigger(idx);
 	}
-	return polarity;
+	pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus);
+	return IOAPIC_LEVEL;
+}
+#else
+static inline int eisa_irq_trigger(int idx, int bus, int trigger)
+{
+	return trigger;
 }
+#endif
 
 static int irq_trigger(int idx)
 {
@@ -947,153 +915,229 @@ static int irq_trigger(int idx)
 	/*
 	 * Determine IRQ trigger mode (edge or level sensitive):
 	 */
-	switch ((mp_irqs[idx].irqflag>>2) & 3)
-	{
-		case 0: /* conforms, ie. bus-type dependent */
-			if (test_bit(bus, mp_bus_not_pci))
-				trigger = default_ISA_trigger(idx);
-			else
-				trigger = default_PCI_trigger(idx);
-#ifdef CONFIG_EISA
-			switch (mp_bus_id_to_type[bus]) {
-				case MP_BUS_ISA: /* ISA pin */
-				{
-					/* set before the switch */
-					break;
-				}
-				case MP_BUS_EISA: /* EISA pin */
-				{
-					trigger = default_EISA_trigger(idx);
-					break;
-				}
-				case MP_BUS_PCI: /* PCI pin */
-				{
-					/* set before the switch */
-					break;
-				}
-				default:
-				{
-					pr_warn("broken BIOS!!\n");
-					trigger = 1;
-					break;
-				}
-			}
-#endif
-			break;
-		case 1: /* edge */
-		{
-			trigger = 0;
-			break;
-		}
-		case 2: /* reserved */
-		{
-			pr_warn("broken BIOS!!\n");
-			trigger = 1;
-			break;
-		}
-		case 3: /* level */
-		{
-			trigger = 1;
-			break;
-		}
-		default: /* invalid */
-		{
-			pr_warn("broken BIOS!!\n");
-			trigger = 0;
-			break;
+	switch ((mp_irqs[idx].irqflag >> 2) & 0x03) {
+	case 0:
+		/* conforms to spec, ie. bus-type dependent trigger mode */
+		if (test_bit(bus, mp_bus_not_pci))
+			trigger = default_ISA_trigger(idx);
+		else
+			trigger = default_PCI_trigger(idx);
+		/* Take EISA into account */
+		return eisa_irq_trigger(idx, bus, trigger);
+	case 1:
+		return IOAPIC_EDGE;
+	case 2:
+		pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n");
+	case 3:
+	default: /* Pointless default required due to gcc stupidity */
+		return IOAPIC_LEVEL;
+	}
+}
+
+void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node,
+			   int trigger, int polarity)
+{
+	init_irq_alloc_info(info, NULL);
+	info->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+	info->ioapic_node = node;
+	info->ioapic_trigger = trigger;
+	info->ioapic_polarity = polarity;
+	info->ioapic_valid = 1;
+}
+
+#ifndef CONFIG_ACPI
+int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity);
+#endif
+
+static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst,
+				   struct irq_alloc_info *src,
+				   u32 gsi, int ioapic_idx, int pin)
+{
+	int trigger, polarity;
+
+	copy_irq_alloc_info(dst, src);
+	dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+	dst->ioapic_id = mpc_ioapic_id(ioapic_idx);
+	dst->ioapic_pin = pin;
+	dst->ioapic_valid = 1;
+	if (src && src->ioapic_valid) {
+		dst->ioapic_node = src->ioapic_node;
+		dst->ioapic_trigger = src->ioapic_trigger;
+		dst->ioapic_polarity = src->ioapic_polarity;
+	} else {
+		dst->ioapic_node = NUMA_NO_NODE;
+		if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) {
+			dst->ioapic_trigger = trigger;
+			dst->ioapic_polarity = polarity;
+		} else {
+			/*
+			 * PCI interrupts are always active low level
+			 * triggered.
+			 */
+			dst->ioapic_trigger = IOAPIC_LEVEL;
+			dst->ioapic_polarity = IOAPIC_POL_LOW;
 		}
 	}
-	return trigger;
 }
 
-static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin)
+static int ioapic_alloc_attr_node(struct irq_alloc_info *info)
+{
+	return (info && info->ioapic_valid) ? info->ioapic_node : NUMA_NO_NODE;
+}
+
+static void mp_register_handler(unsigned int irq, unsigned long trigger)
+{
+#ifndef CONFIG_XEN
+	irq_flow_handler_t hdl;
+	bool fasteoi;
+
+	if (trigger) {
+		irq_set_status_flags(irq, IRQ_LEVEL);
+		fasteoi = true;
+	} else {
+		irq_clear_status_flags(irq, IRQ_LEVEL);
+		fasteoi = false;
+	}
+
+	hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
+	__irq_set_handler(irq, hdl, 0, fasteoi ? "fasteoi" : "edge");
+#endif
+}
+
+static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info)
 {
+	struct mp_chip_data *data = irq_get_chip_data(irq);
+
+	/*
+	 * setup_IO_APIC_irqs() programs all legacy IRQs with default trigger
+	 * and polarity attributes. So allow the first user to reprogram the
+	 * pin with real trigger and polarity attributes.
+	 */
+	if (irq < nr_legacy_irqs() && data->count == 1) {
+		if (info->ioapic_trigger != data->trigger)
+			mp_register_handler(irq, info->ioapic_trigger);
+		data->entry.trigger = data->trigger = info->ioapic_trigger;
+		data->entry.polarity = data->polarity = info->ioapic_polarity;
+	}
+
+	return data->trigger == info->ioapic_trigger &&
+	       data->polarity == info->ioapic_polarity;
+}
+
+static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi,
+				 struct irq_alloc_info *info)
+{
+	bool legacy = false;
 	int irq = -1;
-	int ioapic = (int)(long)domain->host_data;
 	int type = ioapics[ioapic].irqdomain_cfg.type;
 
 	switch (type) {
 	case IOAPIC_DOMAIN_LEGACY:
 		/*
-		 * Dynamically allocate IRQ number for non-ISA IRQs in the first 16
-		 * GSIs on some weird platforms.
+		 * Dynamically allocate IRQ number for non-ISA IRQs in the first
+		 * 16 GSIs on some weird platforms.
 		 */
-		if (gsi < nr_legacy_irqs())
-			irq = irq_create_mapping(domain, pin);
-		else if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0)
+		if (!ioapic_initialized || gsi >= nr_legacy_irqs())
 			irq = gsi;
+		legacy = mp_is_legacy_irq(irq);
 		break;
 	case IOAPIC_DOMAIN_STRICT:
-		if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0)
-			irq = gsi;
+		irq = gsi;
 		break;
 	case IOAPIC_DOMAIN_DYNAMIC:
-		irq = irq_create_mapping(domain, pin);
 		break;
 	default:
 		WARN(1, "ioapic: unknown irqdomain type %d\n", type);
-		break;
+		return -1;
 	}
 
-	return irq > 0 ? irq : -1;
+	return __irq_domain_alloc_irqs(domain, irq, 1,
+				       ioapic_alloc_attr_node(info),
+				       info, legacy);
+}
+
+/*
+ * Need special handling for ISA IRQs because there may be multiple IOAPIC pins
+ * sharing the same ISA IRQ number and irqdomain only supports 1:1 mapping
+ * between IOAPIC pin and IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are
+ * used for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H).
+ * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are available, and
+ * some BIOSes may use MP Interrupt Source records to override IRQ numbers for
+ * PIRQs instead of reprogramming the interrupt routing logic. Thus there may be
+ * multiple pins sharing the same legacy IRQ number when ACPI is disabled.
+ */
+static int alloc_isa_irq_from_domain(struct irq_domain *domain,
+				     int irq, int ioapic, int pin,
+				     struct irq_alloc_info *info)
+{
+	struct mp_chip_data *data;
+	struct irq_data *irq_data = irq_get_irq_data(irq);
+	int node = ioapic_alloc_attr_node(info);
+
+	/*
+	 * Legacy ISA IRQ has already been allocated, just add pin to
+	 * the pin list associated with this IRQ and program the IOAPIC
+	 * entry.
+	 */
+	if (irq_data && irq_data->parent_data) {
+		if (!mp_check_pin_attr(irq, info))
+			return -EBUSY;
+		if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic,
+					  info->ioapic_pin))
+			return -ENOMEM;
+	} else {
+		irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true);
+		if (irq >= 0) {
+			irq_data = irq_domain_get_irq_data(domain, irq);
+			data = irq_data->chip_data;
+			data->isa_irq = true;
+		}
+	}
+
+	return irq;
 }
 
 static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
-			     unsigned int flags)
+			     unsigned int flags, struct irq_alloc_info *info)
 {
-	int irq;
+	int irq = -ENOENT;
+	bool legacy = false;
+	struct irq_alloc_info tmp;
+	struct mp_chip_data *data;
 	struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
-	struct mp_pin_info *info = mp_pin_info(ioapic, pin);
 
 	if (!domain)
-		return -1;
-
-	mutex_lock(&ioapic_mutex);
+		return -ENOSYS;
 
-	/*
-	 * Don't use irqdomain to manage ISA IRQs because there may be
-	 * multiple IOAPIC pins sharing the same ISA IRQ number and
-	 * irqdomain only supports 1:1 mapping between IOAPIC pin and
-	 * IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are used
-	 * for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H).
-	 * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are
-	 * available, and some BIOSes may use MP Interrupt Source records
-	 * to override IRQ numbers for PIRQs instead of reprogramming
-	 * the interrupt routing logic. Thus there may be multiple pins
-	 * sharing the same legacy IRQ number when ACPI is disabled.
-	 */
 	if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) {
 		irq = mp_irqs[idx].srcbusirq;
-		if (flags & IOAPIC_MAP_ALLOC) {
-			if (info->count == 0 &&
-			    mp_irqdomain_map(domain, irq, pin) != 0)
-				irq = -1;
+		legacy = mp_is_legacy_irq(irq);
+	}
 
-			/* special handling for timer IRQ0 */
+	mutex_lock(&ioapic_mutex);
+	if (!(flags & IOAPIC_MAP_ALLOC)) {
+		if (!legacy) {
+			irq = irq_find_mapping(domain, pin);
 			if (irq == 0)
-				info->count++;
+				irq = -ENOENT;
 		}
 	} else {
-		irq = irq_find_mapping(domain, pin);
-		if (irq <= 0 && (flags & IOAPIC_MAP_ALLOC))
-			irq = alloc_irq_from_domain(domain, gsi, pin);
-	}
-
-	if (flags & IOAPIC_MAP_ALLOC) {
-		/* special handling for legacy IRQs */
-		if (irq < nr_legacy_irqs() && info->count == 1 &&
-		    mp_irqdomain_map(domain, irq, pin) != 0)
-			irq = -1;
-
-		if (irq > 0)
-			info->count++;
-		else if (info->count == 0)
-			info->set = 0;
+		ioapic_copy_alloc_attr(&tmp, info, gsi, ioapic, pin);
+		if (legacy)
+			irq = alloc_isa_irq_from_domain(domain, irq,
+							ioapic, pin, &tmp);
+		else if ((irq = irq_find_mapping(domain, pin)) == 0)
+			irq = alloc_irq_from_domain(domain, ioapic, gsi, &tmp);
+		else if (!mp_check_pin_attr(irq, &tmp))
+			irq = -EBUSY;
+		if (irq >= 0) {
+			data = irq_get_chip_data(irq);
+			data->count++;
+		}
 	}
-
 	mutex_unlock(&ioapic_mutex);
 
-	return irq > 0 ? irq : -1;
+	return irq;
 }
 
 static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags)
@@ -1126,10 +1170,10 @@ static int pin_2_irq(int idx, int ioapic
 	}
 #endif
 
-	return  mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags);
+	return  mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, NULL);
 }
 
-int mp_map_gsi_to_irq(u32 gsi, unsigned int flags)
+int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info)
 {
 	int ioapic, pin, idx;
 
@@ -1142,31 +1186,24 @@ int mp_map_gsi_to_irq(u32 gsi, unsigned
 	if ((flags & IOAPIC_MAP_CHECK) && idx < 0)
 		return -1;
 
-	return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags);
+	return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, info);
 }
 
 void mp_unmap_irq(int irq)
 {
-	struct irq_data *data = irq_get_irq_data(irq);
-	struct mp_pin_info *info;
-	int ioapic, pin;
+	struct irq_data *irq_data = irq_get_irq_data(irq);
+	struct mp_chip_data *data;
 
-	if (!data || !data->domain)
+	if (!irq_data || !irq_data->domain)
 		return;
 
-	ioapic = (int)(long)data->domain->host_data;
-	pin = (int)data->hwirq;
-	info = mp_pin_info(ioapic, pin);
+	data = irq_data->chip_data;
+	if (!data || data->isa_irq)
+		return;
 
 	mutex_lock(&ioapic_mutex);
-	if (--info->count == 0) {
-		info->set = 0;
-		if (irq < nr_legacy_irqs() &&
-		    ioapics[ioapic].irqdomain_cfg.type == IOAPIC_DOMAIN_LEGACY)
-			mp_irqdomain_unmap(data->domain, irq);
-		else
-			irq_dispose_mapping(irq);
-	}
+	if (--data->count == 0)
+		irq_domain_free_irqs(irq, 1);
 	mutex_unlock(&ioapic_mutex);
 }
 
@@ -1234,7 +1271,7 @@ out:
 EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
 
 #ifndef CONFIG_XEN
-static struct irq_chip ioapic_chip;
+static struct irq_chip ioapic_chip, ioapic_ir_chip;
 
 #ifdef CONFIG_X86_32
 static inline int IO_APIC_irq_trigger(int irq)
@@ -1257,114 +1294,7 @@ static inline int IO_APIC_irq_trigger(in
 	return 1;
 }
 #endif
-
-static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
-				 unsigned long trigger)
-{
-	struct irq_chip *chip = &ioapic_chip;
-	irq_flow_handler_t hdl;
-	bool fasteoi;
-
-	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
-	    trigger == IOAPIC_LEVEL) {
-		irq_set_status_flags(irq, IRQ_LEVEL);
-		fasteoi = true;
-	} else {
-		irq_clear_status_flags(irq, IRQ_LEVEL);
-		fasteoi = false;
-	}
-
-	if (setup_remapped_irq(irq, cfg, chip))
-		fasteoi = trigger != 0;
-
-	hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
-	irq_set_chip_and_handler_name(irq, chip, hdl,
-				      fasteoi ? "fasteoi" : "edge");
-}
-#endif
-
-#ifndef CONFIG_XEN
-int native_setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
-#else
-static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
 #endif
-			      unsigned int destination, int vector,
-			      struct io_apic_irq_attr *attr)
-{
-	memset(entry, 0, sizeof(*entry));
-
-	entry->delivery_mode = apic->irq_delivery_mode;
-	entry->dest_mode     = apic->irq_dest_mode;
-	entry->dest	     = destination;
-	entry->vector	     = vector;
-	entry->mask	     = 0;			/* enable IRQ */
-	entry->trigger	     = attr->trigger;
-	entry->polarity	     = attr->polarity;
-
-	/*
-	 * Mask level triggered irqs.
-	 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
-	 */
-	if (attr->trigger)
-		entry->mask = 1;
-
-	return 0;
-}
-
-static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
-				struct io_apic_irq_attr *attr)
-{
-	struct IO_APIC_route_entry entry;
-	unsigned int dest;
-
-	if (!IO_APIC_IRQ(irq))
-		return;
-
-	if (assign_irq_vector(irq, cfg, apic->target_cpus()))
-		return;
-
-#ifndef CONFIG_XEN
-	if (apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus(),
-					 &dest)) {
-		pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n",
-			mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
-		clear_irq_vector(irq, cfg);
-
-		return;
-	}
-#else
-	dest = 0; /* meaningless */
-#endif
-
-	apic_printk(APIC_VERBOSE,KERN_DEBUG
-		    "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
-		    "IRQ %d Mode:%i Active:%i Dest:%d)\n",
-		    attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin,
-		    cfg->vector, irq, attr->trigger, attr->polarity, dest);
-
-#ifndef CONFIG_XEN
-	if (x86_io_apic_ops.setup_entry(irq, &entry, dest, cfg->vector, attr)) {
-#else
-	if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) {
-#endif
-		pr_warn("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
-			mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
-		clear_irq_vector(irq, cfg);
-
-		return;
-	}
-
-#ifndef CONFIG_XEN
-	ioapic_register_intr(irq, cfg, attr->trigger);
-	if (irq < nr_legacy_irqs())
-		legacy_pic->mask(irq);
-#else
-	evtchn_register_pirq(irq, mp_pin_to_gsi(attr->ioapic,
-						attr->ioapic_pin));
-#endif
-
-	ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry);
-}
 
 static void __init setup_IO_APIC_irqs(void)
 {
@@ -1386,106 +1316,41 @@ static void __init setup_IO_APIC_irqs(vo
 }
 
 #ifndef CONFIG_XEN
-/*
- * Set up the timer pin, possibly with the 8259A-master behind.
- */
-static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
-					unsigned int pin, int vector)
-{
-	struct IO_APIC_route_entry entry;
-	unsigned int dest;
-
-	memset(&entry, 0, sizeof(entry));
-
-	/*
-	 * We use logical delivery to get the timer IRQ
-	 * to the first CPU.
-	 */
-	if (unlikely(apic->cpu_mask_to_apicid_and(apic->target_cpus(),
-						  apic->target_cpus(), &dest)))
-		dest = BAD_APICID;
-
-	entry.dest_mode = apic->irq_dest_mode;
-	entry.mask = 0;			/* don't mask IRQ for edge */
-	entry.dest = dest;
-	entry.delivery_mode = apic->irq_delivery_mode;
-	entry.polarity = 0;
-	entry.trigger = 0;
-	entry.vector = vector;
-
-	/*
-	 * The timer IRQ doesn't have to know that behind the
-	 * scene we may have a 8259A-master in AEOI mode ...
-	 */
-	irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
-				      "edge");
-
-	/*
-	 * Add it to the IO-APIC irq-routing table:
-	 */
-	ioapic_write_entry(ioapic_idx, pin, entry);
-}
-
-void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
+void ioapic_zap_locks(void)
 {
-	int i;
-
-	pr_debug(" NR Dst Mask Trig IRR Pol Stat Dmod Deli Vect:\n");
-
-	for (i = 0; i <= nr_entries; i++) {
-		struct IO_APIC_route_entry entry;
-
-		entry = ioapic_read_entry(apic, i);
-
-		pr_debug(" %02x %02X  ", i, entry.dest);
-		pr_cont("%1d    %1d    %1d   %1d   %1d    "
-			"%1d    %1d    %02X\n",
-			entry.mask,
-			entry.trigger,
-			entry.irr,
-			entry.polarity,
-			entry.delivery_status,
-			entry.dest_mode,
-			entry.delivery_mode,
-			entry.vector);
-	}
+	raw_spin_lock_init(&ioapic_lock);
 }
 
-void intel_ir_io_apic_print_entries(unsigned int apic,
-				    unsigned int nr_entries)
+static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
 {
 	int i;
+	char buf[256];
+	struct IO_APIC_route_entry entry;
+	struct IR_IO_APIC_route_entry *ir_entry = (void *)&entry;
 
-	pr_debug(" NR Indx Fmt Mask Trig IRR Pol Stat Indx2 Zero Vect:\n");
-
+	printk(KERN_DEBUG "IOAPIC %d:\n", apic);
 	for (i = 0; i <= nr_entries; i++) {
-		struct IR_IO_APIC_route_entry *ir_entry;
-		struct IO_APIC_route_entry entry;
-
 		entry = ioapic_read_entry(apic, i);
-
-		ir_entry = (struct IR_IO_APIC_route_entry *)&entry;
-
-		pr_debug(" %02x %04X ", i, ir_entry->index);
-		pr_cont("%1d   %1d    %1d    %1d   %1d   "
-			"%1d    %1d     %X    %02X\n",
-			ir_entry->format,
-			ir_entry->mask,
-			ir_entry->trigger,
-			ir_entry->irr,
-			ir_entry->polarity,
-			ir_entry->delivery_status,
-			ir_entry->index2,
-			ir_entry->zero,
-			ir_entry->vector);
+		snprintf(buf, sizeof(buf),
+			 " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)",
+			 i,
+			 entry.mask == IOAPIC_MASKED ? "disabled" : "enabled ",
+			 entry.trigger == IOAPIC_LEVEL ? "level" : "edge ",
+			 entry.polarity == IOAPIC_POL_LOW ? "low " : "high",
+			 entry.vector, entry.irr, entry.delivery_status);
+		if (ir_entry->format)
+			printk(KERN_DEBUG "%s, remapped, I(%04X),  Z(%X)\n",
+			       buf, (ir_entry->index2 << 15) | ir_entry->index,
+			       ir_entry->zero);
+		else
+			printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n",
+			       buf,
+			       entry.dest_mode == IOAPIC_DEST_MODE_LOGICAL ?
+			       "logical " : "physical",
+			       entry.dest, entry.delivery_mode);
 	}
 }
 
-void ioapic_zap_locks(void)
-{
-	raw_spin_lock_init(&ioapic_lock);
-}
-
 static void __init print_IO_APIC(int ioapic_idx)
 {
 	union IO_APIC_reg_00 reg_00;
@@ -1539,16 +1404,13 @@ static void __init print_IO_APIC(int ioa
 	}
 
 	printk(KERN_DEBUG ".... IRQ redirection table:\n");
-
-	x86_io_apic_ops.print_entries(ioapic_idx, reg_01.bits.entries);
+	io_apic_print_entries(ioapic_idx, reg_01.bits.entries);
 }
 
 void __init print_IO_APICs(void)
 {
 	int ioapic_idx;
-	struct irq_cfg *cfg;
 	unsigned int irq;
-	struct irq_chip *chip;
 
 	printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
 	for_each_ioapic(ioapic_idx)
@@ -1568,18 +1430,20 @@ void __init print_IO_APICs(void)
 	printk(KERN_DEBUG "IRQ to pin mappings:\n");
 	for_each_active_irq(irq) {
 		struct irq_pin_list *entry;
+		struct irq_chip *chip;
+		struct mp_chip_data *data;
 
 		chip = irq_get_chip(irq);
-		if (chip != &ioapic_chip)
+		if (chip != &ioapic_chip && chip != &ioapic_ir_chip)
 			continue;
-
-		cfg = irq_cfg(irq);
-		if (!cfg)
+		data = irq_get_chip_data(irq);
+		if (!data)
 			continue;
-		if (list_empty(&cfg->irq_2_pin))
+		if (list_empty(&data->irq_2_pin))
 			continue;
+
 		printk(KERN_DEBUG "IRQ%d ", irq);
-		for_each_irq_pin(entry, cfg->irq_2_pin)
+		for_each_irq_pin(entry, data->irq_2_pin)
 			pr_cont("-> %d:%d", entry->apic, entry->pin);
 		pr_cont("\n");
 	}
@@ -1652,15 +1516,12 @@ void native_disable_io_apic(void)
 		struct IO_APIC_route_entry entry;
 
 		memset(&entry, 0, sizeof(entry));
-		entry.mask            = 0; /* Enabled */
-		entry.trigger         = 0; /* Edge */
-		entry.irr             = 0;
-		entry.polarity        = 0; /* High */
-		entry.delivery_status = 0;
-		entry.dest_mode       = 0; /* Physical */
-		entry.delivery_mode   = dest_ExtINT; /* ExtInt */
-		entry.vector          = 0;
-		entry.dest            = read_apic_id();
+		entry.mask		= IOAPIC_UNMASKED;
+		entry.trigger		= IOAPIC_EDGE;
+		entry.polarity		= IOAPIC_POL_HIGH;
+		entry.dest_mode		= IOAPIC_DEST_MODE_PHYSICAL;
+		entry.delivery_mode	= dest_ExtINT;
+		entry.dest		= read_apic_id();
 
 		/*
 		 * Add it to the IO-APIC irq-routing table:
@@ -1670,7 +1531,6 @@ void native_disable_io_apic(void)
 
 	if (cpu_has_apic || apic_from_smp_config())
 		disconnect_bsp_APIC(ioapic_i8259.pin != -1);
-
 }
 
 /*
@@ -1880,7 +1740,6 @@ static int __init timer_irq_works(void)
  * This is not complete - we should be able to fake
  * an edge even if it isn't on the 8259A...
  */
-
 static unsigned int startup_ioapic_irq(struct irq_data *data)
 {
 	int was_pending = 0, irq = data->irq;
@@ -1892,74 +1751,22 @@ static unsigned int startup_ioapic_irq(s
 		if (legacy_pic->irq_pending(irq))
 			was_pending = 1;
 	}
-	__unmask_ioapic(irqd_cfg(data));
+	__unmask_ioapic(data->chip_data);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	return was_pending;
 }
 
-/*
- * Level and edge triggered IO-APIC interrupts need different handling,
- * so we use two separate IRQ descriptors. Edge triggered IRQs can be
- * handled with the level-triggered descriptor, but that one has slightly
- * more overhead. Level-triggered interrupts cannot be handled with the
- * edge-triggered handler, without risking IRQ storms and other ugly
- * races.
- */
-
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
-{
-	int apic, pin;
-	struct irq_pin_list *entry;
-	u8 vector = cfg->vector;
-
-	for_each_irq_pin(entry, cfg->irq_2_pin) {
-		unsigned int reg;
-
-		apic = entry->apic;
-		pin = entry->pin;
-
-		io_apic_write(apic, 0x11 + pin*2, dest);
-		reg = io_apic_read(apic, 0x10 + pin*2);
-		reg &= ~IO_APIC_REDIR_VECTOR_MASK;
-		reg |= vector;
-		io_apic_modify(apic, 0x10 + pin*2, reg);
-	}
-}
-
-int native_ioapic_set_affinity(struct irq_data *data,
-			       const struct cpumask *mask,
-			       bool force)
-{
-	unsigned int dest, irq = data->irq;
-	unsigned long flags;
-	int ret;
-
-	if (!config_enabled(CONFIG_SMP))
-		return -EPERM;
-
-	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	ret = apic_set_affinity(data, mask, &dest);
-	if (!ret) {
-		/* Only the high 8 bits are valid. */
-		dest = SET_APIC_LOGICAL_ID(dest);
-		__target_IO_APIC_irq(irq, dest, irqd_cfg(data));
-		ret = IRQ_SET_MASK_OK_NOCOPY;
-	}
-	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-	return ret;
-}
-
 atomic_t irq_mis_count;
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
+static bool io_apic_level_ack_pending(struct mp_chip_data *data)
 {
 	struct irq_pin_list *entry;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	for_each_irq_pin(entry, cfg->irq_2_pin) {
+	for_each_irq_pin(entry, data->irq_2_pin) {
 		unsigned int reg;
 		int pin;
 
@@ -1976,18 +1783,17 @@ static bool io_apic_level_ack_pending(st
 	return false;
 }
 
-static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
+static inline bool ioapic_irqd_mask(struct irq_data *data)
 {
 	/* If we are moving the irq we need to mask it */
 	if (unlikely(irqd_is_setaffinity_pending(data))) {
-		mask_ioapic(cfg);
+		mask_ioapic_irq(data);
 		return true;
 	}
 	return false;
 }
 
-static inline void ioapic_irqd_unmask(struct irq_data *data,
-				      struct irq_cfg *cfg, bool masked)
+static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
 {
 	if (unlikely(masked)) {
 		/* Only migrate the irq if the ack has been received.
@@ -2016,31 +1822,30 @@ static inline void ioapic_irqd_unmask(st
 		 * accurate and is causing problems then it is a hardware bug
 		 * and you can go talk to the chipset vendor about it.
 		 */
-		if (!io_apic_level_ack_pending(cfg))
+		if (!io_apic_level_ack_pending(data->chip_data))
 			irq_move_masked_irq(data);
-		unmask_ioapic(cfg);
+		unmask_ioapic_irq(data);
 	}
 }
 #else
-static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
+static inline bool ioapic_irqd_mask(struct irq_data *data)
 {
 	return false;
 }
-static inline void ioapic_irqd_unmask(struct irq_data *data,
-				      struct irq_cfg *cfg, bool masked)
+static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
 {
 }
 #endif
 
-static void ack_ioapic_level(struct irq_data *data)
+static void ioapic_ack_level(struct irq_data *irq_data)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
-	int i, irq = data->irq;
+	struct irq_cfg *cfg = irqd_cfg(irq_data);
 	unsigned long v;
 	bool masked;
+	int i;
 
 	irq_complete_move(cfg);
-	masked = ioapic_irqd_mask(data, cfg);
+	masked = ioapic_irqd_mask(irq_data);
 
 	/*
 	 * It appears there is an erratum which affects at least version 0x11
@@ -2092,11 +1897,49 @@ static void ack_ioapic_level(struct irq_
 	 */
 	if (!(v & (1 << (i & 0x1f)))) {
 		atomic_inc(&irq_mis_count);
+		eoi_ioapic_pin(cfg->vector, irq_data->chip_data);
+	}
+
+	ioapic_irqd_unmask(irq_data, masked);
+}
+
+static void ioapic_ir_ack_level(struct irq_data *irq_data)
+{
+	struct mp_chip_data *data = irq_data->chip_data;
+
+	/*
+	 * Intr-remapping uses pin number as the virtual vector
+	 * in the RTE. Actual vector is programmed in
+	 * intr-remapping table entry. Hence for the io-apic
+	 * EOI we use the pin number.
+	 */
+	ack_APIC_irq();
+	eoi_ioapic_pin(data->entry.vector, data);
+}
+
+static int ioapic_set_affinity(struct irq_data *irq_data,
+			       const struct cpumask *mask, bool force)
+{
+	struct irq_data *parent = irq_data->parent_data;
+	struct mp_chip_data *data = irq_data->chip_data;
+	struct irq_pin_list *entry;
+	struct irq_cfg *cfg;
+	unsigned long flags;
+	int ret;
 
-		eoi_ioapic_irq(irq, cfg);
+	ret = parent->chip->irq_set_affinity(parent, mask, force);
+	raw_spin_lock_irqsave(&ioapic_lock, flags);
+	if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) {
+		cfg = irqd_cfg(irq_data);
+		data->entry.dest = cfg->dest_apicid;
+		data->entry.vector = cfg->vector;
+		for_each_irq_pin(entry, data->irq_2_pin)
+			__ioapic_write_entry(entry->apic, entry->pin,
+					     data->entry);
 	}
+	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 
-	ioapic_irqd_unmask(data, cfg, masked);
+	return ret;
 }
 
 static struct irq_chip ioapic_chip __read_mostly = {
@@ -2104,10 +1947,20 @@ static struct irq_chip ioapic_chip __rea
 	.irq_startup		= startup_ioapic_irq,
 	.irq_mask		= mask_ioapic_irq,
 	.irq_unmask		= unmask_ioapic_irq,
-	.irq_ack		= apic_ack_edge,
-	.irq_eoi		= ack_ioapic_level,
-	.irq_set_affinity	= native_ioapic_set_affinity,
-	.irq_retrigger		= apic_retrigger_irq,
+	.irq_ack		= irq_chip_ack_parent,
+	.irq_eoi		= ioapic_ack_level,
+	.irq_set_affinity	= ioapic_set_affinity,
+	.flags			= IRQCHIP_SKIP_SET_WAKE,
+};
+
+static struct irq_chip ioapic_ir_chip __read_mostly = {
+	.name			= "IR-IO-APIC",
+	.irq_startup		= startup_ioapic_irq,
+	.irq_mask		= mask_ioapic_irq,
+	.irq_unmask		= unmask_ioapic_irq,
+	.irq_ack		= irq_chip_ack_parent,
+	.irq_eoi		= ioapic_ir_ack_level,
+	.irq_set_affinity	= ioapic_set_affinity,
 	.flags			= IRQCHIP_SKIP_SET_WAKE,
 };
 #endif /* !CONFIG_XEN */
@@ -2207,12 +2060,12 @@ static inline void __init unlock_ExtINT_
 
 	memset(&entry1, 0, sizeof(entry1));
 
-	entry1.dest_mode = 0;			/* physical delivery */
-	entry1.mask = 0;			/* unmask IRQ now */
+	entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL;
+	entry1.mask = IOAPIC_UNMASKED;
 	entry1.dest = hard_smp_processor_id();
 	entry1.delivery_mode = dest_ExtINT;
 	entry1.polarity = entry0.polarity;
-	entry1.trigger = 0;
+	entry1.trigger = IOAPIC_EDGE;
 	entry1.vector = 0;
 
 	ioapic_write_entry(apic, pin, entry1);
@@ -2246,6 +2099,25 @@ static int __init disable_timer_pin_setu
 }
 early_param("disable_timer_pin_1", disable_timer_pin_setup);
 
+static int mp_alloc_timer_irq(int ioapic, int pin)
+{
+	int irq = -1;
+	struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
+
+	if (domain) {
+		struct irq_alloc_info info;
+
+		ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0);
+		info.ioapic_id = mpc_ioapic_id(ioapic);
+		info.ioapic_pin = pin;
+		mutex_lock(&ioapic_mutex);
+		irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info);
+		mutex_unlock(&ioapic_mutex);
+	}
+
+	return irq;
+}
+
 /*
  * This code may look a bit paranoid, but it's supposed to cooperate with
  * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
@@ -2256,7 +2128,9 @@ early_param("disable_timer_pin_1", disab
  */
 static inline void __init check_timer(void)
 {
-	struct irq_cfg *cfg = irq_cfg(0);
+	struct irq_data *irq_data = irq_get_irq_data(0);
+	struct mp_chip_data *data = irq_data->chip_data;
+	struct irq_cfg *cfg = irqd_cfg(irq_data);
 	int node = cpu_to_node(0);
 	int apic1, pin1, apic2, pin2;
 	unsigned long flags;
@@ -2268,7 +2142,6 @@ static inline void __init check_timer(vo
 	 * get/set the timer IRQ vector:
 	 */
 	legacy_pic->mask(0);
-	assign_irq_vector(0, cfg, apic->target_cpus());
 
 	/*
 	 * As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2309,23 +2182,21 @@ static inline void __init check_timer(vo
 	}
 
 	if (pin1 != -1) {
-		/*
-		 * Ok, does IRQ0 through the IOAPIC work?
-		 */
+		/* Ok, does IRQ0 through the IOAPIC work? */
 		if (no_pin1) {
-			add_pin_to_irq_node(cfg, node, apic1, pin1);
-			setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
+			mp_alloc_timer_irq(apic1, pin1);
 		} else {
-			/* for edge trigger, setup_ioapic_irq already
-			 * leave it unmasked.
+			/*
+			 * For edge-triggered IRQs the pin is already unmasked,
 			 * so we only need to unmask it here if it is
 			 * level-triggered - do we really have a level-triggered timer?
 			 */
 			int idx;
 			idx = find_irq_entry(apic1, pin1, mp_INT);
 			if (idx != -1 && irq_trigger(idx))
-				unmask_ioapic(cfg);
+				unmask_ioapic_irq(irq_get_irq_data(0));
 		}
+		irq_domain_activate_irq(irq_data);
 		if (timer_irq_works()) {
 			if (disable_timer_pin_1 > 0)
 				clear_IO_APIC_pin(0, pin1);
@@ -2345,8 +2216,8 @@ static inline void __init check_timer(vo
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
-		replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
-		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
+		replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2);
+		irq_domain_activate_irq(irq_data);
 		legacy_pic->unmask(0);
 		if (timer_irq_works()) {
 			apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2426,36 +2297,35 @@ out:
 
 static int mp_irqdomain_create(int ioapic)
 {
-	size_t size;
+	struct irq_alloc_info info;
+	struct irq_domain *parent;
 	int hwirqs = mp_ioapic_pin_count(ioapic);
 	struct ioapic *ip = &ioapics[ioapic];
 	struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg;
 	struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
 
-	size = sizeof(struct mp_pin_info) * mp_ioapic_pin_count(ioapic);
-	ip->pin_info = kzalloc(size, GFP_KERNEL);
-	if (!ip->pin_info)
-		return -ENOMEM;
-
 	if (cfg->type == IOAPIC_DOMAIN_INVALID)
 		return 0;
 
+	init_irq_alloc_info(&info, NULL);
+	info.type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+	info.ioapic_id = mpc_ioapic_id(ioapic);
+	parent = irq_remapping_get_ir_irq_domain(&info);
+	if (!parent)
+		parent = x86_vector_domain;
+
 	ip->irqdomain = irq_domain_add_linear(cfg->dev, hwirqs, cfg->ops,
 					      (void *)(long)ioapic);
-	if(!ip->irqdomain) {
-		kfree(ip->pin_info);
-		ip->pin_info = NULL;
+	if (!ip->irqdomain)
 		return -ENOMEM;
-	}
+
+	ip->irqdomain->parent = parent;
 
 	if (cfg->type == IOAPIC_DOMAIN_LEGACY ||
 	    cfg->type == IOAPIC_DOMAIN_STRICT)
 		ioapic_dynirq_base = max(ioapic_dynirq_base,
 					 gsi_cfg->gsi_end + 1);
 
-	if (gsi_cfg->gsi_base == 0)
-		irq_set_default_host(ip->irqdomain);
-
 	return 0;
 }
 
@@ -2465,8 +2335,6 @@ static void ioapic_destroy_irqdomain(int
 		irq_domain_remove(ioapics[idx].irqdomain);
 		ioapics[idx].irqdomain = NULL;
 	}
-	kfree(ioapics[idx].pin_info);
-	ioapics[idx].pin_info = NULL;
 }
 
 void __init setup_IO_APIC(void)
@@ -2498,28 +2366,6 @@ void __init setup_IO_APIC(void)
 	ioapic_initialized = 1;
 }
 
-/*
- *      Called after all the initialization is done. If we didn't find any
- *      APIC bugs then we can allow the modify fast path
- */
-
-static int __init io_apic_bug_finalize(void)
-{
-	if (sis_apic_bug == -1)
-		sis_apic_bug = 0;
-#ifdef CONFIG_X86_XEN
-	if (is_initial_xendomain()) {
-		struct xen_platform_op op = { .cmd = XENPF_platform_quirk };
-		op.u.platform_quirk.quirk_id = sis_apic_bug ?
-			QUIRK_IOAPIC_BAD_REGSEL : QUIRK_IOAPIC_GOOD_REGSEL;
-		VOID(HYPERVISOR_platform_op(&op));
-	}
-#endif
-	return 0;
-}
-
-late_initcall(io_apic_bug_finalize);
-
 #ifndef CONFIG_XEN
 static void resume_ioapic_id(int ioapic_idx)
 {
@@ -2560,20 +2406,6 @@ static int __init ioapic_init_ops(void)
 device_initcall(ioapic_init_ops);
 #endif /* !CONFIG_XEN */
 
-static int
-io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
-{
-	struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node);
-	int ret;
-
-	if (!cfg)
-		return -EINVAL;
-	ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin);
-	if (!ret)
-		setup_ioapic_irq(irq, cfg, attr);
-	return ret;
-}
-
 static int io_apic_get_redir_entries(int ioapic)
 {
 	union IO_APIC_reg_01	reg_01;
@@ -2806,7 +2638,7 @@ void __init setup_ioapic_dest(void)
 		else
 			mask = apic->target_cpus();
 
-		x86_io_apic_ops.set_affinity(idata, mask, false);
+		irq_set_affinity(irq, mask);
 	}
 
 }
@@ -2851,7 +2683,7 @@ static struct resource * __init ioapic_s
 	return res;
 }
 
-void __init native_io_apic_init_mappings(void)
+void __init io_apic_init_mappings(void)
 {
 	unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
 	struct resource *ioapic_res;
@@ -3085,7 +2917,6 @@ int mp_unregister_ioapic(u32 gsi_base)
 {
 	int ioapic, pin;
 	int found = 0;
-	struct mp_pin_info *pin_info;
 
 	for_each_ioapic(ioapic)
 		if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) {
@@ -3098,11 +2929,17 @@ int mp_unregister_ioapic(u32 gsi_base)
 	}
 
 	for_each_pin(ioapic, pin) {
-		pin_info = mp_pin_info(ioapic, pin);
-		if (pin_info->count) {
-			pr_warn("pin%d on IOAPIC%d is still in use.\n",
-				pin, ioapic);
-			return -EBUSY;
+		u32 gsi = mp_pin_to_gsi(ioapic, pin);
+		int irq = mp_map_gsi_to_irq(gsi, 0, NULL);
+		struct mp_chip_data *data;
+
+		if (irq >= 0) {
+			data = irq_get_chip_data(irq);
+			if (data && data->count) {
+				pr_warn("pin%d on IOAPIC%d is still in use.\n",
+					pin, ioapic);
+				return -EBUSY;
+			}
 		}
 	}
 
@@ -3131,112 +2968,163 @@ int mp_ioapic_registered(u32 gsi_base)
 	return 0;
 }
 
-static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
-					int ioapic, int ioapic_pin,
-					int trigger, int polarity)
+static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data,
+				  struct irq_alloc_info *info)
 {
-	irq_attr->ioapic	= ioapic;
-	irq_attr->ioapic_pin	= ioapic_pin;
-	irq_attr->trigger	= trigger;
-	irq_attr->polarity	= polarity;
+	if (info && info->ioapic_valid) {
+		data->trigger = info->ioapic_trigger;
+		data->polarity = info->ioapic_polarity;
+	} else if (acpi_get_override_irq(gsi, &data->trigger,
+					 &data->polarity) < 0) {
+		/* PCI interrupts are always active low level triggered. */
+		data->trigger = IOAPIC_LEVEL;
+		data->polarity = IOAPIC_POL_LOW;
+	}
+}
+
+static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data,
+			   struct IO_APIC_route_entry *entry)
+{
+	memset(entry, 0, sizeof(*entry));
+	entry->delivery_mode = apic->irq_delivery_mode;
+	entry->dest_mode     = apic->irq_dest_mode;
+#ifndef CONFIG_XEN /* meaningless on Xen */
+	entry->dest	     = cfg->dest_apicid;
+#endif
+	entry->vector	     = cfg->vector;
+	entry->trigger	     = data->trigger;
+	entry->polarity	     = data->polarity;
+	/*
+	 * Mask level triggered irqs. Edge triggered irqs are masked
+	 * by the irq core code in case they fire.
+	 */
+	if (data->trigger == IOAPIC_LEVEL)
+		entry->mask = IOAPIC_MASKED;
+	else
+		entry->mask = IOAPIC_UNMASKED;
 }
 
-int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq,
-		     irq_hw_number_t hwirq)
+int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+		       unsigned int nr_irqs, void *arg)
 {
-	int ioapic = (int)(long)domain->host_data;
-	struct mp_pin_info *info = mp_pin_info(ioapic, hwirq);
-	struct io_apic_irq_attr attr;
+	int ret, ioapic, pin;
+	struct irq_cfg *cfg;
+	struct irq_data *irq_data;
+	struct mp_chip_data *data;
+	struct irq_alloc_info *info = arg;
 
-	/* Get default attribute if not set by caller yet */
-	if (!info->set) {
-		u32 gsi = mp_pin_to_gsi(ioapic, hwirq);
+	if (!info || nr_irqs > 1)
+		return -EINVAL;
+	irq_data = irq_domain_get_irq_data(domain, virq);
+	if (!irq_data)
+		return -EINVAL;
 
-		if (acpi_get_override_irq(gsi, &info->trigger,
-					  &info->polarity) < 0) {
-			/*
-			 * PCI interrupts are always polarity one level
-			 * triggered.
-			 */
-			info->trigger = 1;
-			info->polarity = 1;
-		}
-		info->node = NUMA_NO_NODE;
+	ioapic = mp_irqdomain_ioapic_idx(domain);
+	pin = info->ioapic_pin;
+	if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0)
+		return -EEXIST;
 
-		/*
-		 * setup_IO_APIC_irqs() programs all legacy IRQs with default
-		 * trigger and polarity attributes. Don't set the flag for that
-		 * case so the first legacy IRQ user could reprogram the pin
-		 * with real trigger and polarity attributes.
-		 */
-		if (virq >= nr_legacy_irqs() || info->count)
-			info->set = 1;
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	info->ioapic_entry = &data->entry;
+	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info);
+	if (ret < 0) {
+		kfree(data);
+		return ret;
 	}
-	set_io_apic_irq_attr(&attr, ioapic, hwirq, info->trigger,
-			     info->polarity);
 
-	return io_apic_setup_irq_pin(virq, info->node, &attr);
-}
+#ifndef CONFIG_XEN
+	INIT_LIST_HEAD(&data->irq_2_pin);
+#endif
+	irq_data->hwirq = info->ioapic_pin;
+#ifndef CONFIG_XEN
+	irq_data->chip = (domain->parent == x86_vector_domain) ?
+			  &ioapic_chip : &ioapic_ir_chip;
+#endif
+	irq_data->chip_data = data;
+	mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info);
 
-void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq)
-{
+	cfg = irqd_cfg(irq_data);
+	add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin);
+	if (info->ioapic_entry)
+		mp_setup_entry(cfg, data, info->ioapic_entry);
 #ifndef CONFIG_XEN
-	struct irq_data *data = irq_get_irq_data(virq);
-	struct irq_cfg *cfg = irq_cfg(virq);
-	int ioapic = (int)(long)domain->host_data;
-	int pin = (int)data->hwirq;
+	mp_register_handler(virq, data->trigger);
+	if (virq < nr_legacy_irqs())
+		legacy_pic->mask(virq);
+
+	apic_printk(APIC_VERBOSE, KERN_DEBUG
+		    "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n",
+		    ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector,
+		    virq, data->trigger, data->polarity, cfg->dest_apicid);
+#else
+	evtchn_register_pirq(virq, mp_pin_to_gsi(ioapic, pin));
 
-	ioapic_mask_entry(ioapic, pin);
-	__remove_pin_from_irq(cfg, ioapic, pin);
-	WARN_ON(!list_empty(&cfg->irq_2_pin));
-	arch_teardown_hwirq(virq);
+	apic_printk(APIC_VERBOSE, KERN_DEBUG
+		    "IOAPIC[%d]: Set routing entry (%d-%d -> %#x -> IRQ %d Mode:%i Active:%i)\n",
+		    ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector,
+		    virq, data->trigger, data->polarity);
 #endif
+
+	return 0;
 }
 
-int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node)
+void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
+		       unsigned int nr_irqs)
 {
-	int ret = 0;
-	int ioapic, pin;
-	struct mp_pin_info *info;
+	struct irq_data *irq_data;
+	struct mp_chip_data *data;
 
-	ioapic = mp_find_ioapic(gsi);
-	if (ioapic < 0)
-		return -ENODEV;
-
-	pin = mp_find_ioapic_pin(ioapic, gsi);
-	info = mp_pin_info(ioapic, pin);
-	trigger = trigger ? 1 : 0;
-	polarity = polarity ? 1 : 0;
-
-	mutex_lock(&ioapic_mutex);
-	if (!info->set) {
-		info->trigger = trigger;
-		info->polarity = polarity;
-		info->node = node;
-		info->set = 1;
-	} else if (info->trigger != trigger || info->polarity != polarity) {
-		ret = -EBUSY;
+	BUG_ON(nr_irqs != 1);
+	irq_data = irq_domain_get_irq_data(domain, virq);
+	if (irq_data && irq_data->chip_data) {
+		data = irq_data->chip_data;
+#ifndef CONFIG_XEN
+		__remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain),
+				      (int)irq_data->hwirq);
+		WARN_ON(!list_empty(&data->irq_2_pin));
+#endif
+		kfree(data);
 	}
-	mutex_unlock(&ioapic_mutex);
-
-	return ret;
+	irq_domain_free_irqs_top(domain, virq, nr_irqs);
 }
 
-#ifdef CONFIG_X86_MRST
-/* Enable IOAPIC early just for system timer */
-void __init pre_init_apic_IRQ0(void)
+void mp_irqdomain_activate(struct irq_domain *domain,
+			   struct irq_data *irq_data)
 {
-	struct io_apic_irq_attr attr = { 0, 0, 0, 0 };
+	struct mp_chip_data *data = irq_data->chip_data;
+#ifndef CONFIG_XEN
+	unsigned long flags;
+	struct irq_pin_list *entry;
 
-	printk(KERN_INFO "Early APIC setup for system timer0\n");
-#ifndef CONFIG_SMP
-	physid_set_mask_of_physid(boot_cpu_physical_apicid,
-					 &phys_cpu_present_map);
+	raw_spin_lock_irqsave(&ioapic_lock, flags);
+	for_each_irq_pin(entry, data->irq_2_pin)
+		__ioapic_write_entry(entry->apic, entry->pin, data->entry);
+	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+#else
+	ioapic_write_entry(mp_irqdomain_ioapic_idx(domain), irq_data->hwirq,
+			   data->entry);
 #endif
-	setup_local_APIC();
+}
 
-	io_apic_setup_irq_pin(0, 0, &attr);
-	irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
-				      "edge");
+void mp_irqdomain_deactivate(struct irq_domain *domain,
+			     struct irq_data *irq_data)
+{
+	/* This is never called for an IRQ with multiple IOAPIC pins associated */
+	ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain),
+			  (int)irq_data->hwirq);
 }
-#endif
+
+int mp_irqdomain_ioapic_idx(struct irq_domain *domain)
+{
+	return (int)(long)domain->host_data;
+}
+
+const struct irq_domain_ops mp_ioapic_irqdomain_ops = {
+	.alloc		= mp_irqdomain_alloc,
+	.free		= mp_irqdomain_free,
+	.activate	= mp_irqdomain_activate,
+	.deactivate	= mp_irqdomain_deactivate,
+};
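
The reworked io_apic_print_entries() earlier in this file's hunks reassembles the interrupt-remapping table index from two bitfields before printing: index2 carries bit 15 and index the low 15 bits, so leaving index2 out would lose the top bit of the handle. A minimal user-space sketch of that recombination, using a simplified stand-in struct rather than the kernel's real IR_IO_APIC_route_entry layout:

#include <stdio.h>

/* Hypothetical, simplified bitfield layout - for illustration only. */
struct ir_rte_sketch {
	unsigned int vector : 8;	/* programmed vector */
	unsigned int index2 : 1;	/* bit 15 of the remapping-table index */
	unsigned int index  : 15;	/* low 15 bits of the index */
};

int main(void)
{
	struct ir_rte_sketch e = { .vector = 0x31, .index2 = 1, .index = 0x1234 };
	unsigned int full_index = (e.index2 << 15) | e.index;

	/* prints "remapped, I(9234), V(31)" */
	printf("remapped, I(%04X), V(%02X)\n", full_index, e.vector);
	return 0;
}
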
--- a/arch/x86/kernel/apic/vector-xen.c
+++ b/arch/x86/kernel/apic/vector-xen.c
@@ -3,6 +3,8 @@
  *
  * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
  *	Moved from arch/x86/kernel/apic/io_apic.c.
+ * Jiang Liu <jiang.liu@linux.intel.com>
+ *	Enable support of hierarchical irqdomains
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -11,8 +13,8 @@
 #include <linux/interrupt.h>
 #include <linux/init.h>
 #include <linux/compiler.h>
-#include <linux/irqdomain.h>
 #include <linux/slab.h>
+#include <asm/irqdomain.h>
 #include <asm/hw_irq.h>
 #include <asm/apic.h>
 #include <asm/i8259.h>
@@ -20,7 +22,20 @@
 #include <asm/irq_remapping.h>
 
 #ifndef CONFIG_XEN
+struct apic_chip_data {
+	struct irq_cfg		cfg;
+	cpumask_var_t		domain;
+	cpumask_var_t		old_domain;
+	u8			move_in_progress : 1;
+};
+
+struct irq_domain *x86_vector_domain;
 static DEFINE_RAW_SPINLOCK(vector_lock);
+static cpumask_var_t vector_cpumask;
+static struct irq_chip lapic_controller;
+#ifdef	CONFIG_X86_IO_APIC
+static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY];
+#endif
 
 void lock_vector_lock(void)
 {
@@ -34,74 +49,66 @@ void unlock_vector_lock(void)
 {
 	raw_spin_unlock(&vector_lock);
 }
-#endif /* CONFIG_XEN */
 
-struct irq_cfg *irq_cfg(unsigned int irq)
+static struct apic_chip_data *apic_chip_data(struct irq_data *irq_data)
+#else
+struct irq_cfg *irqd_cfg(struct irq_data *irq_data)
+#endif /* CONFIG_XEN */
 {
-	return irq_get_chip_data(irq);
+	if (!irq_data)
+		return NULL;
+
+	while (irq_data->parent_data)
+		irq_data = irq_data->parent_data;
+
+	return irq_data->chip_data;
 }
 
+#ifndef CONFIG_XEN
 struct irq_cfg *irqd_cfg(struct irq_data *irq_data)
 {
-	return irq_data->chip_data;
+	struct apic_chip_data *data = apic_chip_data(irq_data);
+
+	return data ? &data->cfg : NULL;
+}
+#endif
+
+struct irq_cfg *irq_cfg(unsigned int irq)
+{
+	return irqd_cfg(irq_get_irq_data(irq));
 }
 
 #ifndef CONFIG_XEN
-static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
+static struct apic_chip_data *alloc_apic_chip_data(int node)
 {
-	struct irq_cfg *cfg;
+	struct apic_chip_data *data;
 
-	cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);
-	if (!cfg)
+	data = kzalloc_node(sizeof(*data), GFP_KERNEL, node);
+	if (!data)
 		return NULL;
-	if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node))
-		goto out_cfg;
-	if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node))
+	if (!zalloc_cpumask_var_node(&data->domain, GFP_KERNEL, node))
+		goto out_data;
+	if (!zalloc_cpumask_var_node(&data->old_domain, GFP_KERNEL, node))
 		goto out_domain;
-#ifdef	CONFIG_X86_IO_APIC
-	INIT_LIST_HEAD(&cfg->irq_2_pin);
-#endif
-	return cfg;
+	return data;
 out_domain:
-	free_cpumask_var(cfg->domain);
-out_cfg:
-	kfree(cfg);
+	free_cpumask_var(data->domain);
+out_data:
+	kfree(data);
 	return NULL;
 }
 
-struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
+static void free_apic_chip_data(struct apic_chip_data *data)
 {
-	int res = irq_alloc_desc_at(at, node);
-	struct irq_cfg *cfg;
-
-	if (res < 0) {
-		if (res != -EEXIST)
-			return NULL;
-		cfg = irq_cfg(at);
-		if (cfg)
-			return cfg;
+	if (data) {
+		free_cpumask_var(data->domain);
+		free_cpumask_var(data->old_domain);
+		kfree(data);
 	}
-
-	cfg = alloc_irq_cfg(at, node);
-	if (cfg)
-		irq_set_chip_data(at, cfg);
-	else
-		irq_free_desc(at);
-	return cfg;
 }
 
-static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
-{
-	if (!cfg)
-		return;
-	irq_set_chip_data(at, NULL);
-	free_cpumask_var(cfg->domain);
-	free_cpumask_var(cfg->old_domain);
-	kfree(cfg);
-}
-
-static int
-__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+static int __assign_irq_vector(int irq, struct apic_chip_data *d,
+			       const struct cpumask *mask)
 {
 	/*
 	 * NOTE! The local APIC isn't very good at handling
@@ -117,36 +124,33 @@ __assign_irq_vector(int irq, struct irq_
 	static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
 	static int current_offset = VECTOR_OFFSET_START % 16;
 	int cpu, err;
-	cpumask_var_t tmp_mask;
 
-	if (cfg->move_in_progress)
+	if (d->move_in_progress)
 		return -EBUSY;
 
-	if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
-		return -ENOMEM;
-
 	/* Only try and allocate irqs on cpus that are present */
 	err = -ENOSPC;
-	cpumask_clear(cfg->old_domain);
+	cpumask_clear(d->old_domain);
 	cpu = cpumask_first_and(mask, cpu_online_mask);
 	while (cpu < nr_cpu_ids) {
 		int new_cpu, vector, offset;
 
-		apic->vector_allocation_domain(cpu, tmp_mask, mask);
+		apic->vector_allocation_domain(cpu, vector_cpumask, mask);
 
-		if (cpumask_subset(tmp_mask, cfg->domain)) {
+		if (cpumask_subset(vector_cpumask, d->domain)) {
 			err = 0;
-			if (cpumask_equal(tmp_mask, cfg->domain))
+			if (cpumask_equal(vector_cpumask, d->domain))
 				break;
 			/*
 			 * New cpumask using the vector is a proper subset of
 			 * the current in use mask. So cleanup the vector
 			 * allocation for the members that are not used anymore.
 			 */
-			cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask);
-			cfg->move_in_progress =
-			   cpumask_intersects(cfg->old_domain, cpu_online_mask);
-			cpumask_and(cfg->domain, cfg->domain, tmp_mask);
+			cpumask_andnot(d->old_domain, d->domain,
+				       vector_cpumask);
+			d->move_in_progress =
+			   cpumask_intersects(d->old_domain, cpu_online_mask);
+			cpumask_and(d->domain, d->domain, vector_cpumask);
 			break;
 		}
 
@@ -160,16 +164,18 @@ next:
 		}
 
 		if (unlikely(current_vector == vector)) {
-			cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask);
-			cpumask_andnot(tmp_mask, mask, cfg->old_domain);
-			cpu = cpumask_first_and(tmp_mask, cpu_online_mask);
+			cpumask_or(d->old_domain, d->old_domain,
+				   vector_cpumask);
+			cpumask_andnot(vector_cpumask, mask, d->old_domain);
+			cpu = cpumask_first_and(vector_cpumask,
+						cpu_online_mask);
 			continue;
 		}
 
 		if (test_bit(vector, used_vectors))
 			goto next;
 
-		for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) {
+		for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) {
 			if (per_cpu(vector_irq, new_cpu)[vector] >
 			    VECTOR_UNDEFINED)
 				goto next;
@@ -177,57 +183,73 @@ next:
 		/* Found one! */
 		current_vector = vector;
 		current_offset = offset;
-		if (cfg->vector) {
-			cpumask_copy(cfg->old_domain, cfg->domain);
-			cfg->move_in_progress =
-			   cpumask_intersects(cfg->old_domain, cpu_online_mask);
+		if (d->cfg.vector) {
+			cpumask_copy(d->old_domain, d->domain);
+			d->move_in_progress =
+			   cpumask_intersects(d->old_domain, cpu_online_mask);
 		}
-		for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
+		for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask)
 			per_cpu(vector_irq, new_cpu)[vector] = irq;
-		cfg->vector = vector;
-		cpumask_copy(cfg->domain, tmp_mask);
+		d->cfg.vector = vector;
+		cpumask_copy(d->domain, vector_cpumask);
 		err = 0;
 		break;
 	}
-	free_cpumask_var(tmp_mask);
+
+	if (!err) {
+		/* cache destination APIC IDs into cfg->dest_apicid */
+		err = apic->cpu_mask_to_apicid_and(mask, d->domain,
+						   &d->cfg.dest_apicid);
+	}
 
 	return err;
 }
 
-int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+static int assign_irq_vector(int irq, struct apic_chip_data *data,
+			     const struct cpumask *mask)
 {
 	int err;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&vector_lock, flags);
-	err = __assign_irq_vector(irq, cfg, mask);
+	err = __assign_irq_vector(irq, data, mask);
 	raw_spin_unlock_irqrestore(&vector_lock, flags);
 	return err;
 }
-#endif /* CONFIG_XEN */
 
-void clear_irq_vector(int irq, struct irq_cfg *cfg)
+static int assign_irq_vector_policy(int irq, int node,
+				    struct apic_chip_data *data,
+				    struct irq_alloc_info *info)
+{
+	if (info && info->mask)
+		return assign_irq_vector(irq, data, info->mask);
+	if (node != NUMA_NO_NODE &&
+	    assign_irq_vector(irq, data, cpumask_of_node(node)) == 0)
+		return 0;
+	return assign_irq_vector(irq, data, apic->target_cpus());
+}
+
+static void clear_irq_vector(int irq, struct apic_chip_data *data)
 {
-#ifndef CONFIG_XEN
 	int cpu, vector;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&vector_lock, flags);
-	BUG_ON(!cfg->vector);
+	BUG_ON(!data->cfg.vector);
 
-	vector = cfg->vector;
-	for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
+	vector = data->cfg.vector;
+	for_each_cpu_and(cpu, data->domain, cpu_online_mask)
 		per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
 
-	cfg->vector = 0;
-	cpumask_clear(cfg->domain);
+	data->cfg.vector = 0;
+	cpumask_clear(data->domain);
 
-	if (likely(!cfg->move_in_progress)) {
+	if (likely(!data->move_in_progress)) {
 		raw_spin_unlock_irqrestore(&vector_lock, flags);
 		return;
 	}
 
-	for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
+	for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) {
 		for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
 		     vector++) {
 			if (per_cpu(vector_irq, cpu)[vector] != irq)
@@ -236,12 +258,97 @@ void clear_irq_vector(int irq, struct ir
 			break;
 		}
 	}
-	cfg->move_in_progress = 0;
+	data->move_in_progress = 0;
 	raw_spin_unlock_irqrestore(&vector_lock, flags);
+}
 #endif /* CONFIG_XEN */
+
+void init_irq_alloc_info(struct irq_alloc_info *info,
+			 const struct cpumask *mask)
+{
+	memset(info, 0, sizeof(*info));
+	info->mask = mask;
+}
+
+void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src)
+{
+	if (src)
+		*dst = *src;
+	else
+		memset(dst, 0, sizeof(*dst));
 }
 
 #ifndef CONFIG_XEN
+static void x86_vector_free_irqs(struct irq_domain *domain,
+				 unsigned int virq, unsigned int nr_irqs)
+{
+	struct irq_data *irq_data;
+	int i;
+
+	for (i = 0; i < nr_irqs; i++) {
+		irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i);
+		if (irq_data && irq_data->chip_data) {
+			clear_irq_vector(virq + i, irq_data->chip_data);
+			free_apic_chip_data(irq_data->chip_data);
+#ifdef	CONFIG_X86_IO_APIC
+			if (virq + i < nr_legacy_irqs())
+				legacy_irq_data[virq + i] = NULL;
+#endif
+			irq_domain_reset_irq_data(irq_data);
+		}
+	}
+}
+
+static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
+				 unsigned int nr_irqs, void *arg)
+{
+	struct irq_alloc_info *info = arg;
+	struct apic_chip_data *data;
+	struct irq_data *irq_data;
+	int i, err;
+
+	if (disable_apic)
+		return -ENXIO;
+
+	/* Currently vector allocator can't guarantee contiguous allocations */
+	if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1)
+		return -ENOSYS;
+
+	for (i = 0; i < nr_irqs; i++) {
+		irq_data = irq_domain_get_irq_data(domain, virq + i);
+		BUG_ON(!irq_data);
+#ifdef	CONFIG_X86_IO_APIC
+		if (virq + i < nr_legacy_irqs() && legacy_irq_data[virq + i])
+			data = legacy_irq_data[virq + i];
+		else
+#endif
+			data = alloc_apic_chip_data(irq_data->node);
+		if (!data) {
+			err = -ENOMEM;
+			goto error;
+		}
+
+		irq_data->chip = &lapic_controller;
+		irq_data->chip_data = data;
+		irq_data->hwirq = virq + i;
+		err = assign_irq_vector_policy(virq + i, irq_data->node, data,
+					       info);
+		if (err)
+			goto error;
+	}
+
+	return 0;
+
+error:
+	x86_vector_free_irqs(domain, virq, i + 1);
+	return err;
+}
+
+static const struct irq_domain_ops x86_vector_domain_ops = {
+	.alloc	= x86_vector_alloc_irqs,
+	.free	= x86_vector_free_irqs,
+};
+
 int __init arch_probe_nr_irqs(void)
 {
 	int nr;
@@ -265,8 +372,43 @@ int __init arch_probe_nr_irqs(void)
 	return nr_legacy_irqs();
 }
 
+#ifdef	CONFIG_X86_IO_APIC
+static void init_legacy_irqs(void)
+{
+	int i, node = cpu_to_node(0);
+	struct apic_chip_data *data;
+
+	/*
+	 * For legacy IRQs, start by assigning irq0 to irq15 to
+	 * ISA_IRQ_VECTOR(i) for all CPUs.
+	 */
+	for (i = 0; i < nr_legacy_irqs(); i++) {
+		data = legacy_irq_data[i] = alloc_apic_chip_data(node);
+		BUG_ON(!data);
+
+		data->cfg.vector = ISA_IRQ_VECTOR(i);
+		cpumask_setall(data->domain);
+		irq_set_chip_data(i, data);
+	}
+}
+#else
+static void init_legacy_irqs(void) { }
+#endif
+
 int __init arch_early_irq_init(void)
 {
+	init_legacy_irqs();
+
+	x86_vector_domain = irq_domain_add_tree(NULL, &x86_vector_domain_ops,
+						NULL);
+	BUG_ON(x86_vector_domain == NULL);
+	irq_set_default_host(x86_vector_domain);
+
+	arch_init_msi_domain(x86_vector_domain);
+	arch_init_htirq_domain(x86_vector_domain);
+
+	BUG_ON(!alloc_cpumask_var(&vector_cpumask, GFP_KERNEL));
+
 	return arch_early_ioapic_init();
 }
 
@@ -274,23 +416,17 @@ static void __setup_vector_irq(int cpu)
 {
 	/* Initialize vector_irq on a new cpu */
 	int irq, vector;
-	struct irq_cfg *cfg;
+	struct apic_chip_data *data;
 
-	/*
-	 * vector_lock will make sure that we don't run into irq vector
-	 * assignments that might be happening on another cpu in parallel,
-	 * while we setup our initial vector to irq mappings.
-	 */
-	raw_spin_lock(&vector_lock);
 	/* Mark the inuse vectors */
 	for_each_active_irq(irq) {
-		cfg = irq_cfg(irq);
-		if (!cfg)
+		data = apic_chip_data(irq_get_irq_data(irq));
+		if (!data)
 			continue;
 
-		if (!cpumask_test_cpu(cpu, cfg->domain))
+		if (!cpumask_test_cpu(cpu, data->domain))
 			continue;
-		vector = cfg->vector;
+		vector = data->cfg.vector;
 		per_cpu(vector_irq, cpu)[vector] = irq;
 	}
 	/* Mark the free vectors */
@@ -299,20 +435,20 @@ static void __setup_vector_irq(int cpu)
 		if (irq <= VECTOR_UNDEFINED)
 			continue;
 
-		cfg = irq_cfg(irq);
-		if (!cpumask_test_cpu(cpu, cfg->domain))
+		data = apic_chip_data(irq_get_irq_data(irq));
+		if (!cpumask_test_cpu(cpu, data->domain))
 			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
 	}
-	raw_spin_unlock(&vector_lock);
 }
 
 /*
- * Setup the vector to irq mappings.
+ * Set up the vector to irq mappings. Must be called with vector_lock held.
  */
 void setup_vector_irq(int cpu)
 {
 	int irq;
 
+	lockdep_assert_held(&vector_lock);
 	/*
 	 * On most of the platforms, legacy PIC delivers the interrupts on the
 	 * boot cpu. But there are certain platforms where PIC interrupts are
@@ -321,20 +457,20 @@ void setup_vector_irq(int cpu)
 	 * legacy vector to irq mapping:
 	 */
 	for (irq = 0; irq < nr_legacy_irqs(); irq++)
-		per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq;
+		per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq;
 
 	__setup_vector_irq(cpu);
 }
 
-int apic_retrigger_irq(struct irq_data *data)
+static int apic_retrigger_irq(struct irq_data *irq_data)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
+	struct apic_chip_data *data = apic_chip_data(irq_data);
 	unsigned long flags;
 	int cpu;
 
 	raw_spin_lock_irqsave(&vector_lock, flags);
-	cpu = cpumask_first_and(cfg->domain, cpu_online_mask);
-	apic->send_IPI_mask(cpumask_of(cpu), cfg->vector);
+	cpu = cpumask_first_and(data->domain, cpu_online_mask);
+	apic->send_IPI_mask(cpumask_of(cpu), data->cfg.vector);
 	raw_spin_unlock_irqrestore(&vector_lock, flags);
 
 	return 1;
@@ -347,73 +483,76 @@ void apic_ack_edge(struct irq_data *data
 	ack_APIC_irq();
 }
 
-/*
- * Either sets data->affinity to a valid value, and returns
- * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
- * leaves data->affinity untouched.
- */
-int apic_set_affinity(struct irq_data *data, const struct cpumask *mask,
-		      unsigned int *dest_id)
+static int apic_set_affinity(struct irq_data *irq_data,
+			     const struct cpumask *dest, bool force)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
-	unsigned int irq = data->irq;
-	int err;
+	struct apic_chip_data *data = irq_data->chip_data;
+	int err, irq = irq_data->irq;
 
 	if (!config_enabled(CONFIG_SMP))
 		return -EPERM;
 
-	if (!cpumask_intersects(mask, cpu_online_mask))
+	if (!cpumask_intersects(dest, cpu_online_mask))
 		return -EINVAL;
 
-	err = assign_irq_vector(irq, cfg, mask);
-	if (err)
-		return err;
-
-	err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id);
+	err = assign_irq_vector(irq, data, dest);
 	if (err) {
-		if (assign_irq_vector(irq, cfg, data->affinity))
+		struct irq_data *top = irq_get_irq_data(irq);
+
+		if (assign_irq_vector(irq, data, top->affinity))
 			pr_err("Failed to recover vector for irq %d\n", irq);
 		return err;
 	}
 
-	cpumask_copy(data->affinity, mask);
-
-	return 0;
+	return IRQ_SET_MASK_OK;
 }
 
+static struct irq_chip lapic_controller = {
+	.irq_ack		= apic_ack_edge,
+	.irq_set_affinity	= apic_set_affinity,
+	.irq_retrigger		= apic_retrigger_irq,
+};
+
 #ifdef CONFIG_SMP
-void send_cleanup_vector(struct irq_cfg *cfg)
+static void __send_cleanup_vector(struct apic_chip_data *data)
 {
 	cpumask_var_t cleanup_mask;
 
 	if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
 		unsigned int i;
 
-		for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+		for_each_cpu_and(i, data->old_domain, cpu_online_mask)
 			apic->send_IPI_mask(cpumask_of(i),
 					    IRQ_MOVE_CLEANUP_VECTOR);
 	} else {
-		cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
+		cpumask_and(cleanup_mask, data->old_domain, cpu_online_mask);
 		apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
 		free_cpumask_var(cleanup_mask);
 	}
-	cfg->move_in_progress = 0;
+	data->move_in_progress = 0;
+}
+
+void send_cleanup_vector(struct irq_cfg *cfg)
+{
+	struct apic_chip_data *data;
+
+	data = container_of(cfg, struct apic_chip_data, cfg);
+	if (data->move_in_progress)
+		__send_cleanup_vector(data);
 }
 
 asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
 {
 	unsigned vector, me;
 
-	ack_APIC_irq();
-	irq_enter();
-	exit_idle();
+	entering_ack_irq();
 
 	me = smp_processor_id();
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
 		int irq;
 		unsigned int irr;
 		struct irq_desc *desc;
-		struct irq_cfg *cfg;
+		struct apic_chip_data *data;
 
 		irq = __this_cpu_read(vector_irq[vector]);
 
@@ -424,8 +563,8 @@ asmlinkage __visible void smp_irq_move_c
 		if (!desc)
 			continue;
 
-		cfg = irq_cfg(irq);
-		if (!cfg)
+		data = apic_chip_data(&desc->irq_data);
+		if (!data)
 			continue;
 
 		raw_spin_lock(&desc->lock);
@@ -434,10 +573,11 @@ asmlinkage __visible void smp_irq_move_c
 		 * Check if the irq migration is in progress. If so, we
 		 * haven't received the cleanup request yet for this irq.
 		 */
-		if (cfg->move_in_progress)
+		if (data->move_in_progress)
 			goto unlock;
 
-		if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
+		if (vector == data->cfg.vector &&
+		    cpumask_test_cpu(me, data->domain))
 			goto unlock;
 
 		irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
@@ -457,20 +597,21 @@ unlock:
 		raw_spin_unlock(&desc->lock);
 	}
 
-	irq_exit();
+	exiting_irq();
 }
 
 static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
 {
 	unsigned me;
+	struct apic_chip_data *data;
 
-	if (likely(!cfg->move_in_progress))
+	data = container_of(cfg, struct apic_chip_data, cfg);
+	if (likely(!data->move_in_progress))
 		return;
 
 	me = smp_processor_id();
-
-	if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
-		send_cleanup_vector(cfg);
+	if (vector == data->cfg.vector && cpumask_test_cpu(me, data->domain))
+		__send_cleanup_vector(data);
 }
 
 void irq_complete_move(struct irq_cfg *cfg)
@@ -482,46 +623,11 @@ void irq_force_complete_move(int irq)
 {
 	struct irq_cfg *cfg = irq_cfg(irq);
 
-	if (!cfg)
-		return;
-
-	__irq_complete_move(cfg, cfg->vector);
+	if (cfg)
+		__irq_complete_move(cfg, cfg->vector);
 }
 #endif
 
-/*
- * Dynamic irq allocate and deallocation. Should be replaced by irq domains!
- */
-int arch_setup_hwirq(unsigned int irq, int node)
-{
-	struct irq_cfg *cfg;
-	unsigned long flags;
-	int ret;
-
-	cfg = alloc_irq_cfg(irq, node);
-	if (!cfg)
-		return -ENOMEM;
-
-	raw_spin_lock_irqsave(&vector_lock, flags);
-	ret = __assign_irq_vector(irq, cfg, apic->target_cpus());
-	raw_spin_unlock_irqrestore(&vector_lock, flags);
-
-	if (!ret)
-		irq_set_chip_data(irq, cfg);
-	else
-		free_irq_cfg(irq, cfg);
-	return ret;
-}
-
-void arch_teardown_hwirq(unsigned int irq)
-{
-	struct irq_cfg *cfg = irq_cfg(irq);
-
-	free_remapped_irq(irq);
-	clear_irq_vector(irq, cfg);
-	free_irq_cfg(irq, cfg);
-}
-
 static void __init print_APIC_field(int base)
 {
 	int i;
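
The __assign_irq_vector() changes in the vector-xen.c hunks above keep the existing probe order: candidate vectors advance in steps of 16 so consecutive allocations land in different priority classes, and the offset rotates once the top of the external-vector range is reached. A minimal user-space sketch of that search policy, with assumed constants and a single used[] table standing in for the per-CPU vector_irq arrays:

#include <stdbool.h>
#include <stdio.h>

#define FIRST_EXTERNAL_VECTOR	0x20	/* assumed values, for illustration */
#define FIRST_SYSTEM_VECTOR	0xef
#define NR_VECTORS		256

static bool used[NR_VECTORS];		/* stands in for per-CPU vector_irq */

static int alloc_vector(void)
{
	static int current_vector = FIRST_EXTERNAL_VECTOR;
	static int current_offset;
	int vector = current_vector, offset = current_offset;

	for (;;) {
		vector += 16;			/* next priority class */
		if (vector >= FIRST_SYSTEM_VECTOR) {
			offset = (offset + 1) % 16;
			vector = FIRST_EXTERNAL_VECTOR + offset;
		}
		if (vector == current_vector)
			return -1;		/* wrapped: nothing free */
		if (used[vector])
			continue;
		current_vector = vector;	/* remember where to resume */
		current_offset = offset;
		used[vector] = true;
		return vector;
	}
}

int main(void)
{
	for (int i = 0; i < 4; i++)
		printf("allocated vector 0x%02x\n", alloc_vector());
	return 0;
}

This models only the probe order; the real code additionally walks the per-CPU vector_allocation_domain() masks and records old_domain so the cleanup IPI can release vectors on CPUs that are no longer targeted.
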
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -61,6 +61,8 @@ void foo(void)
 
 #ifdef CONFIG_XEN
 	BLANK();
+	OFFSET(TI_cpu, thread_info, cpu);
+	BLANK();
 	OFFSET(XEN_START_mfn_list, start_info, mfn_list);
 #endif
 
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -296,7 +296,7 @@ static int nearby_node(int apicid)
  *     Assumption: Number of cores in each internal node is the same.
  * (2) AMD processors supporting compute units
  */
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
 static void amd_get_topology(struct cpuinfo_x86 *c)
 {
 	u32 cores_per_cu = 1;
@@ -349,7 +349,7 @@ static void amd_get_topology(struct cpui
  */
 static void amd_detect_cmp(struct cpuinfo_x86 *c)
 {
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
 	unsigned bits;
 	int cpu = smp_processor_id();
 
@@ -434,7 +434,7 @@ static void srat_detect_node(struct cpui
 
 static void early_init_amd_mc(struct cpuinfo_x86 *c)
 {
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
 	unsigned bits, ecx;
 
 	/* Multi core CPU? */
--- a/arch/x86/kernel/cpu/common-xen.c
+++ b/arch/x86/kernel/cpu/common-xen.c
@@ -5,6 +5,7 @@
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/string.h>
+#include <linux/ctype.h>
 #include <linux/delay.h>
 #include <linux/sched.h>
 #include <linux/init.h>
@@ -31,8 +32,7 @@
 #include <asm/setup.h>
 #include <asm/apic.h>
 #include <asm/desc.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
 #include <asm/mtrr.h>
 #include <linux/numa.h>
 #include <asm/asm.h>
@@ -162,32 +162,21 @@ EXPORT_PER_CPU_SYMBOL(xen_x86_cr0);
 EXPORT_PER_CPU_SYMBOL(xen_x86_cr0_upd);
 #endif
 
-static int __init x86_xsave_setup(char *s)
+static int __init x86_mpx_setup(char *s)
 {
-	if (s && *s)
+	/* require an exact match without trailing characters */
+	if (strlen(s))
 		return 0;
-	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
-	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
-	setup_clear_cpu_cap(X86_FEATURE_AVX);
-	setup_clear_cpu_cap(X86_FEATURE_AVX2);
-	return 1;
-}
-__setup("noxsave", x86_xsave_setup);
 
-static int __init x86_xsaveopt_setup(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-	return 1;
-}
-__setup("noxsaveopt", x86_xsaveopt_setup);
+	/* do not emit a message if the feature is not present */
+	if (!boot_cpu_has(X86_FEATURE_MPX))
+		return 1;
 
-static int __init x86_xsaves_setup(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+	setup_clear_cpu_cap(X86_FEATURE_MPX);
+	pr_info("nompx: Intel Memory Protection Extensions (MPX) disabled\n");
 	return 1;
 }
-__setup("noxsaves", x86_xsaves_setup);
+__setup("nompx", x86_mpx_setup);
 
 #ifdef CONFIG_X86_32
 static int cachesize_override = -1;
@@ -199,14 +188,6 @@ static int __init cachesize_setup(char *
 }
 __setup("cachesize=", cachesize_setup);
 
-static int __init x86_fxsr_setup(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_FXSR);
-	setup_clear_cpu_cap(X86_FEATURE_XMM);
-	return 1;
-}
-__setup("nofxsr", x86_fxsr_setup);
-
 static int __init x86_sep_setup(char *s)
 {
 	setup_clear_cpu_cap(X86_FEATURE_SEP);
@@ -461,7 +442,7 @@ static const struct cpu_dev *cpu_devs[X8
 static void get_model_name(struct cpuinfo_x86 *c)
 {
 	unsigned int *v;
-	char *p, *q;
+	char *p, *q, *s;
 
 	if (c->extended_cpuid_level < 0x80000004)
 		return;
@@ -472,19 +453,21 @@ static void get_model_name(struct cpuinf
 	cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
 	c->x86_model_id[48] = 0;
 
-	/*
-	 * Intel chips right-justify this string for some dumb reason;
-	 * undo that brain damage:
-	 */
-	p = q = &c->x86_model_id[0];
+	/* Trim whitespace */
+	p = q = s = &c->x86_model_id[0];
+
 	while (*p == ' ')
 		p++;
-	if (p != q) {
-		while (*p)
-			*q++ = *p++;
-		while (q <= &c->x86_model_id[48])
-			*q++ = '\0';	/* Zero-pad the rest */
+
+	while (*p) {
+		/* Note the last non-whitespace index */
+		if (!isspace(*p))
+			s = q;
+
+		*q++ = *p++;
 	}
+
+	*(s + 1) = '\0';
 }
 
 void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
@@ -550,7 +533,7 @@ static void cpu_detect_tlb(struct cpuinf
 
 void detect_ht(struct cpuinfo_x86 *c)
 {
-#ifdef CONFIG_X86_HT
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
 	u32 eax, ebx, ecx, edx;
 	int index_msb, core_bits;
 	static bool printed;
@@ -804,11 +787,6 @@ static void __init early_identify_cpu(st
 	cpu_detect(c);
 	get_cpu_vendor(c);
 	get_cpu_cap(c);
-	fpu_detect(c);
-#ifdef CONFIG_XEN
-	if (!cpu_has_xsave)
-		x86_xsave_setup(NULL);
-#endif
 
 	if (this_cpu->c_early_init)
 		this_cpu->c_early_init(c);
@@ -820,6 +798,7 @@ static void __init early_identify_cpu(st
 		this_cpu->c_bsp_init(c);
 
 	setup_force_cpu_cap(X86_FEATURE_ALWAYS);
+	fpu__init_system(c);
 }
 
 void __init early_cpu_init(void)
@@ -894,7 +873,7 @@ static void generic_identify(struct cpui
 	if (c->cpuid_level >= 0x00000001) {
 		c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
 #ifdef CONFIG_X86_32
-# ifdef CONFIG_X86_HT
+# ifdef CONFIG_SMP
 		c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
 # else
 		c->apicid = c->initial_apicid;
@@ -1060,19 +1039,19 @@ static void identify_cpu(struct cpuinfo_
 #ifdef CONFIG_X86_32
 void enable_sep_cpu(void)
 {
-	extern asmlinkage void ia32pv_sysenter_target(void);
+	extern asmlinkage void entry_SYSENTER_PV32(void);
 	static struct callback_register sysenter = {
 		.type = CALLBACKTYPE_sysenter,
-		.address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
+		.address = { __KERNEL_CS, (unsigned long)entry_SYSENTER_PV32 },
 	};
 
 # ifdef TIF_CSTAR
 	if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
-		extern asmlinkage void ia32pv_cstar_target(void);
+		extern asmlinkage void entry_SYSCALL_PV32(void);
 		static const struct callback_register cstar = {
 			.type = CALLBACKTYPE_syscall32,
 			.address = { __KERNEL_CS,
-			             (unsigned long)ia32pv_cstar_target },
+			             (unsigned long)entry_SYSCALL_PV32 },
 		};
 
 		if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0)
@@ -1085,7 +1064,7 @@ void enable_sep_cpu(void)
 		return;
 
 	if (xen_feature(XENFEAT_supervisor_mode_kernel))
-		sysenter.address.eip = (unsigned long)ia32_sysenter_target;
+		sysenter.address.eip = (unsigned long)entry_SYSENTER_32;
 
 	switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) {
 	case 0:
@@ -1196,7 +1175,7 @@ void print_cpu_info(struct cpuinfo_x86 *
 		printk(KERN_CONT "%s ", vendor);
 
 	if (c->x86_model_id[0])
-		printk(KERN_CONT "%s", strim(c->x86_model_id));
+		printk(KERN_CONT "%s", c->x86_model_id);
 	else
 		printk(KERN_CONT "%d86", c->x86);
 
@@ -1229,10 +1208,6 @@ static __init int setup_disablecpuid(cha
 }
 __setup("clearcpuid=", setup_disablecpuid);
 
-DEFINE_PER_CPU(unsigned long, kernel_stack) =
-	(unsigned long)&init_thread_union + THREAD_SIZE;
-EXPORT_PER_CPU_SYMBOL(kernel_stack);
-
 #ifdef CONFIG_X86_64
 #ifndef CONFIG_X86_NO_IDT
 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
@@ -1267,8 +1242,6 @@ DEFINE_PER_CPU(unsigned int, irq_count)
 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
 
-DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
-
 #ifndef CONFIG_X86_NO_TSS
 /*
  * Special IST stacks which the CPU switches to when it calls
@@ -1294,10 +1267,10 @@ void syscall_init(void)
 	 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
 	 */
 	wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32);
-	wrmsrl(MSR_LSTAR, system_call);
+	wrmsrl(MSR_LSTAR, entry_SYSCALL_64);
 
 #ifdef CONFIG_IA32_EMULATION
-	wrmsrl(MSR_CSTAR, ia32_cstar_target);
+	wrmsrl(MSR_CSTAR, entry_SYSCALL_compat);
 	/*
 	 * This only works on Intel CPUs.
 	 * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
@@ -1306,7 +1279,7 @@ void syscall_init(void)
 	 */
 	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
 	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
-	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
+	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
 #else
 	wrmsrl(MSR_CSTAR, ignore_sysret);
 	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
@@ -1322,11 +1295,11 @@ void syscall_init(void)
 #ifdef CONFIG_IA32_EMULATION
 	static const struct callback_register cstar = {
 		.type = CALLBACKTYPE_syscall32,
-		.address = (unsigned long)ia32_cstar_target
+		.address = (unsigned long)entry_SYSCALL_compat
 	};
 	static const struct callback_register sysenter = {
 		.type = CALLBACKTYPE_sysenter,
-		.address = (unsigned long)ia32_sysenter_target
+		.address = (unsigned long)entry_SYSENTER_compat
 	};
 
 	if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0)
@@ -1393,7 +1366,6 @@ DEFINE_PER_CPU(struct task_struct *, cur
 EXPORT_PER_CPU_SYMBOL(current_task);
 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
-DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
 
 /*
  * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
@@ -1571,12 +1543,12 @@ void cpu_init(void)
 	set_tss_desc(cpu, t);
 	load_TR_desc();
 #endif
-	load_LDT(&init_mm.context);
+	load_mm_ldt(&init_mm);
 
 	clear_all_debug_regs();
 	dbg_restore_debug_regs();
 
-	fpu_init();
+	fpu__init_cpu();
 
 #ifdef CONFIG_X86_LOCAL_APIC
 	if (is_uv_system())
@@ -1622,7 +1594,7 @@ void cpu_init(void)
 
 	load_sp0(t, thread);
 
-	load_LDT(&init_mm.context);
+	load_mm_ldt(&init_mm);
 
 #ifndef CONFIG_X86_NO_TSS
 	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
@@ -1636,7 +1608,7 @@ void cpu_init(void)
 	clear_all_debug_regs();
 	dbg_restore_debug_regs();
 
-	fpu_init();
+	fpu__init_cpu();
 }
 #endif
 
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -654,7 +654,7 @@ unsigned int init_intel_cacheinfo(struct
 	unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0;
 	unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
 	unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
 	unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
 	unsigned int cpu = c->cpu_index;
 #endif
@@ -689,7 +689,7 @@ unsigned int init_intel_cacheinfo(struct
 				break;
 			case 2:
 				new_l2 = this_leaf.size/1024;
-#ifdef CONFIG_X86_HT
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
 				num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
 				index_msb = get_count_order(num_threads_sharing);
 				l2_id = c->apicid & ~((1 << index_msb) - 1);
@@ -697,7 +697,7 @@ unsigned int init_intel_cacheinfo(struct
 				break;
 			case 3:
 				new_l3 = this_leaf.size/1024;
-#ifdef CONFIG_X86_HT
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
 				num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
 				index_msb = get_count_order(num_threads_sharing);
 				l3_id = c->apicid & ~((1 << index_msb) - 1);
@@ -778,19 +778,19 @@ unsigned int init_intel_cacheinfo(struct
 
 	if (new_l2) {
 		l2 = new_l2;
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
 		per_cpu(cpu_llc_id, cpu) = l2_id;
 #endif
 	}
 
 	if (new_l3) {
 		l3 = new_l3;
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
 		per_cpu(cpu_llc_id, cpu) = l3_id;
 #endif
 	}
 
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
 	/*
 	 * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in
 	 * turns means that the only possibility is SMT (as indicated in
--- a/arch/x86/kernel/cpu/microcode/core-xen.c
+++ b/arch/x86/kernel/cpu/microcode/core-xen.c
@@ -1,25 +1,16 @@
 /*
- *	CPU Microcode Update Driver for Linux on Xen
+ * CPU Microcode Update Driver for Linux on Xen
  *
- *	Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
- *		      2006	Shaohua Li <shaohua.li@intel.com>
- *
- *	This driver allows to upgrade microcode on Intel processors
- *	belonging to IA-32 family - PentiumPro, Pentium II,
- *	Pentium III, Xeon, Pentium 4, etc.
- *
- *	Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
- *	Software Developer's Manual
- *	Order Number 253668 or free download from:
- *
- *	http://developer.intel.com/Assets/PDF/manual/253668.pdf
- *
- *	For more information, go to http://www.urbanmyth.org/microcode
- *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License
- *	as published by the Free Software Foundation; either version
- *	2 of the License, or (at your option) any later version.
+ * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ *	      2006	Shaohua Li <shaohua.li@intel.com>
+ *	      2013-2015	Borislav Petkov <bp@alien8.de>
+ *
+ * This driver allows upgrading microcode on x86 processors.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
--- a/arch/x86/kernel/cpu/mtrr/main-xen.c
+++ b/arch/x86/kernel/cpu/mtrr/main-xen.c
@@ -36,6 +36,13 @@ const struct mtrr_ops generic_mtrr_ops =
 
 const struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
 unsigned int num_var_ranges;
+static bool __mtrr_enabled;
+
+static bool mtrr_enabled(void)
+{
+	return __mtrr_enabled;
+}
+
 unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
 
 static u64 tom2;
@@ -67,6 +74,9 @@ int mtrr_add_page(unsigned long base, un
 	int error;
 	struct xen_platform_op op;
 
+	if (!mtrr_enabled())
+		return -ENXIO;
+
 	mutex_lock(&mtrr_mutex);
 
 	op.cmd = XENPF_add_memtype;
@@ -102,6 +112,8 @@ static int mtrr_check(unsigned long base
 int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
 	     bool increment)
 {
+	if (!mtrr_enabled())
+		return -ENODEV;
 	if (mtrr_check(base, size))
 		return -EINVAL;
 	return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
@@ -117,6 +129,9 @@ int mtrr_del_page(int reg, unsigned long
 	int error = -EINVAL;
 	struct xen_platform_op op;
 
+	if (!mtrr_enabled())
+		return -ENODEV;
+
 	mutex_lock(&mtrr_mutex);
 
 	if (reg < 0) {
@@ -156,6 +171,8 @@ int mtrr_del_page(int reg, unsigned long
 
 int mtrr_del(int reg, unsigned long base, unsigned long size)
 {
+	if (!mtrr_enabled())
+		return -ENODEV;
 	if (mtrr_check(base, size))
 		return -EINVAL;
 	return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
@@ -171,6 +188,9 @@ EXPORT_SYMBOL(mtrr_del);
  * attempts to add a WC MTRR covering size bytes starting at base and
  * logs an error if this fails.
  *
+ * The caller should provide a power of two size on an equivalent
+ * power of two boundary.
+ *
  * Drivers must store the return value to pass to mtrr_del_wc_if_needed,
  * but drivers should not try to interpret that return value.
  */
@@ -178,7 +198,7 @@ int arch_phys_wc_add(unsigned long base,
 {
 	int ret;
 
-	if (pat_enabled)
+	if (pat_enabled() || !mtrr_enabled())
 		return 0;  /* Success!  (We don't need to do anything.) */
 
 	ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true);
@@ -210,7 +230,7 @@ void arch_phys_wc_del(int handle)
 EXPORT_SYMBOL(arch_phys_wc_del);
 
 /*
- * phys_wc_to_mtrr_index - translates arch_phys_wc_add's return value
+ * arch_phys_wc_index - translates arch_phys_wc_add's return value
  * @handle: Return value from arch_phys_wc_add
  *
  * This will turn the return value from arch_phys_wc_add into an mtrr
@@ -220,33 +240,40 @@ EXPORT_SYMBOL(arch_phys_wc_del);
  * in printk line.  Alas there is an illegitimate use in some ancient
  * drm ioctls.
  */
-int phys_wc_to_mtrr_index(int handle)
+int arch_phys_wc_index(int handle)
 {
 	if (handle < MTRR_TO_PHYS_WC_OFFSET)
 		return -1;
 	else
 		return handle - MTRR_TO_PHYS_WC_OFFSET;
 }
-EXPORT_SYMBOL_GPL(phys_wc_to_mtrr_index);
+EXPORT_SYMBOL_GPL(arch_phys_wc_index);
 
-/*
- * Returns the effective MTRR type for the region
- * Error returns:
- * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR
- * - 0xFF - when MTRR is not enabled
+/**
+ * mtrr_type_lookup - look up memory type in MTRR
+ *
+ * Return Values:
+ * MTRR_TYPE_(type)  - The effective MTRR type for the region
+ * MTRR_TYPE_INVALID - MTRR is disabled
+ *
+ * Output Argument:
+ * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the
+ *	     region is fully covered by a single MTRR entry or the default
+ *	     type.
  */
-u8 mtrr_type_lookup(u64 start, u64 end)
+u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform)
 {
 	int i, error;
 	u64 start_mfn, end_mfn, base_mfn, top_mfn;
 	u8 prev_match, curr_match;
 	struct xen_platform_op op;
 
+	*uniform = 1;
 	if (!is_initial_xendomain())
 		return MTRR_TYPE_WRBACK;
 
 	if (!num_var_ranges)
-		return 0xFF;
+		return MTRR_TYPE_INVALID;
 
 	start_mfn = start >> PAGE_SHIFT;
 	/* Make end inclusive end, instead of exclusive */
@@ -261,6 +288,7 @@ u8 mtrr_type_lookup(u64 start, u64 end)
 		if (!error)
 			return op.u.read_memtype.type;
 #endif
+		*uniform = 0;
 		return MTRR_TYPE_UNCACHABLE;
 	}
 
@@ -269,7 +297,7 @@ u8 mtrr_type_lookup(u64 start, u64 end)
 	 * Look of multiple ranges matching this address and pick type
 	 * as per MTRR precedence
 	 */
-	prev_match = 0xFF;
+	prev_match = MTRR_TYPE_INVALID;
 	for (i = 0; i < num_var_ranges; ++i) {
 		op.cmd = XENPF_read_memtype;
 		op.u.read_memtype.reg = i;
@@ -285,12 +313,11 @@ u8 mtrr_type_lookup(u64 start, u64 end)
 			continue;
 		}
 
-		if (base_mfn > start_mfn || end_mfn > top_mfn) {
-			return 0xFE;
-		}
+		if (base_mfn > start_mfn || end_mfn > top_mfn)
+			*uniform = 0;
 
 		curr_match = op.u.read_memtype.type;
-		if (prev_match == 0xFF) {
+		if (prev_match == MTRR_TYPE_INVALID) {
 			prev_match = curr_match;
 			continue;
 		}
@@ -318,7 +345,7 @@ u8 mtrr_type_lookup(u64 start, u64 end)
 			return MTRR_TYPE_WRBACK;
 	}
 
-	if (prev_match != 0xFF)
+	if (prev_match != MTRR_TYPE_INVALID)
 		return prev_match;
 
 #if 0//todo
@@ -369,6 +396,9 @@ void __init mtrr_bp_init(void)
 		rdmsrl(MSR_K8_TOP_MEM2, tom2);
 		tom2 &= 0xffffff8000000ULL;
 	}
+
+	if (!mtrr_enabled())
+		pr_info("MTRR: Disabled\n");
 }
 
 void mtrr_ap_init(void)
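Note on the mtrr_type_lookup() rework above: the magic 0xFE/0xFF error codes are gone, replaced by MTRR_TYPE_INVALID plus a separate 'uniform' output argument. The standalone toy below (not the real implementation; the range table and the precedence handling are heavily simplified) only illustrates how a caller is meant to read the new contract:

#include <stdio.h>
#include <stdint.h>

#define MTRR_TYPE_UNCACHABLE	0
#define MTRR_TYPE_WRCOMB	1
#define MTRR_TYPE_WRBACK	6
#define MTRR_TYPE_INVALID	0xff	/* replaces the bare 0xFE/0xFF codes */

struct range { uint64_t base, top; uint8_t type; };

/* Toy variable-range table standing in for the Xen-provided MTRRs. */
static const struct range ranges[] = {
	{ 0x00000000, 0x80000000, MTRR_TYPE_WRBACK },
	{ 0xd0000000, 0xd8000000, MTRR_TYPE_WRCOMB },
};

/* Minimal model of the new contract: return a type, report uniformity. */
static uint8_t toy_type_lookup(uint64_t start, uint64_t end, uint8_t *uniform)
{
	*uniform = 1;
	for (unsigned int i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++) {
		if (end <= ranges[i].base || start >= ranges[i].top)
			continue;
		if (start < ranges[i].base || end > ranges[i].top)
			*uniform = 0;	/* only partially covered */
		return ranges[i].type;
	}
	return MTRR_TYPE_INVALID;	/* no MTRR covers the region */
}

int main(void)
{
	uint8_t uniform;
	uint8_t type = toy_type_lookup(0xd0000000, 0xd9000000, &uniform);

	printf("type=%#x uniform=%u\n", type, uniform);
	return 0;
}
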
--- a/arch/x86/kernel/e820-xen.c
+++ b/arch/x86/kernel/e820-xen.c
@@ -172,6 +172,7 @@ static void __init e820_print_type(u32 t
 	case E820_UNUSABLE:
 		printk(KERN_CONT "unusable");
 		break;
+	case E820_PMEM:
 	case E820_PRAM:
 		printk(KERN_CONT "persistent (type %u)", type);
 		break;
@@ -1030,11 +1031,32 @@ static inline const char *e820_type_to_s
 	case E820_ACPI:	return "ACPI Tables";
 	case E820_NVS:	return "ACPI Non-volatile Storage";
 	case E820_UNUSABLE:	return "Unusable memory";
-	case E820_PRAM: return "Persistent RAM";
+	case E820_PRAM: return "Persistent Memory (legacy)";
+	case E820_PMEM: return "Persistent Memory";
 	default:	return "reserved";
 	}
 }
 
+static bool do_mark_busy(u32 type, struct resource *res)
+{
+	/* this is the legacy bios/dos rom-shadow + mmio region */
+	if (res->start < (1ULL<<20))
+		return true;
+
+	/*
+	 * Treat persistent memory like device memory, i.e. reserve it
+	 * for exclusive use of a driver
+	 */
+	switch (type) {
+	case E820_RESERVED:
+	case E820_PRAM:
+	case E820_PMEM:
+		return false;
+	default:
+		return true;
+	}
+}
+
 #ifdef CONFIG_XEN
 #define e820 machine_e820
 #endif
@@ -1068,9 +1090,7 @@ void __init e820_reserve_resources(void)
 		 * pci device BAR resource and insert them later in
 		 * pcibios_resource_survey()
 		 */
-		if (((e820.map[i].type != E820_RESERVED) &&
-		     (e820.map[i].type != E820_PRAM)) ||
-		     res->start < (1ULL<<20)) {
+		if (do_mark_busy(e820.map[i].type, res)) {
 			res->flags |= IORESOURCE_BUSY;
 			insert_resource(&iomem_resource, res);
 		}
@@ -1252,7 +1272,8 @@ void __init memblock_find_dma_reserve(vo
 		nr_pages += end_pfn - start_pfn;
 	}
 
-	for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) {
+	for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
+				NULL) {
 		start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
 		end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
 		if (start_pfn < end_pfn)
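The new do_mark_busy() above centralizes which e820 ranges get IORESOURCE_BUSY: everything below 1 MiB stays busy, while reserved and persistent-memory ranges are left unclaimed so a driver can take them. A minimal standalone model of that decision (the type values are stand-ins):

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins for the e820 type constants used above. */
enum { E820_RAM = 1, E820_RESERVED = 2, E820_PMEM = 7, E820_PRAM = 12 };

/* Mirror of the patch logic: the legacy low-memory area is always busy,
 * reserved and persistent-memory ranges are left for drivers to claim. */
static bool do_mark_busy(unsigned int type, unsigned long long start)
{
	if (start < (1ULL << 20))
		return true;

	switch (type) {
	case E820_RESERVED:
	case E820_PRAM:
	case E820_PMEM:
		return false;
	default:
		return true;
	}
}

int main(void)
{
	printf("%d %d %d\n",
	       do_mark_busy(E820_RAM, 0x100000000ULL),	/* 1: normal RAM   */
	       do_mark_busy(E820_PMEM, 0x100000000ULL),	/* 0: driver-owned */
	       do_mark_busy(E820_PMEM, 0x80000ULL));	/* 1: below 1 MiB  */
	return 0;
}
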
--- a/arch/x86/kernel/early_printk-xen.c
+++ b/arch/x86/kernel/early_printk-xen.c
@@ -174,7 +174,9 @@ static __init void early_serial_init(cha
 	}
 
 	if (*s) {
-		if (kstrtoul(s, 0, &baud) < 0 || baud == 0)
+		baud = simple_strtoull(s, &e, 0);
+
+		if (baud == 0 || s == e)
 			baud = DEFAULT_BAUD;
 	}
 
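The early_serial_init() hunk above replaces kstrtoul() with simple_strtoull() so that a trailing suffix such as "115200n8" no longer forces the fallback baud rate; only an empty or zero value does. A userspace sketch of the same parsing rule (DEFAULT_BAUD and the sample strings are illustrative):

#include <stdio.h>
#include <stdlib.h>

#define DEFAULT_BAUD 9600

static unsigned long long parse_baud(const char *s)
{
	char *e;
	unsigned long long baud = strtoull(s, &e, 0);

	/* Same test as the patch: fall back only if nothing was parsed
	 * or the value is zero; a suffix after the digits is fine. */
	if (baud == 0 || s == e)
		baud = DEFAULT_BAUD;
	return baud;
}

int main(void)
{
	printf("%llu %llu %llu\n",
	       parse_baud("115200n8"),	/* 115200 */
	       parse_baud("n8"),	/* 9600   */
	       parse_baud("0"));	/* 9600   */
	return 0;
}
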
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -6,6 +6,8 @@
 
 #include <linux/sched.h>
 
+static int x86_noxsave_setup(char *);
+
 /*
  * Initialize the TS bit in CR0 according to the style of context-switches
  * we are using:
@@ -64,7 +66,7 @@ void fpu__init_cpu(void)
  * Set the X86_FEATURE_FPU CPU-capability bit based on
  * trying to execute an actual sequence of FPU instructions:
  */
-static void fpu__init_system_early_generic(struct cpuinfo_x86 *c)
+static void __init fpu__init_system_early_generic(struct cpuinfo_x86 *c)
 {
 	unsigned long cr0;
 	u16 fsw, fcw;
@@ -90,6 +92,10 @@ static void fpu__init_system_early_gener
 			asm volatile("hlt");
 	}
 #endif
+#ifdef CONFIG_XEN
+	if (!cpu_has_xsave)
+		x86_noxsave_setup("");
+#endif
 }
 
 /*
--- a/arch/x86/kernel/head-xen.c
+++ b/arch/x86/kernel/head-xen.c
@@ -196,7 +196,7 @@ void __init xen_arch_setup(void)
 #ifdef CONFIG_X86_64
 	static const struct callback_register __initconst syscall = {
 		.type = CALLBACKTYPE_syscall,
-		.address = CALLBACK_ADDR(system_call)
+		.address = CALLBACK_ADDR(entry_SYSCALL_64)
 	};
 #endif
 	static const struct callback_register __initconst nmi_cb = {
--- a/arch/x86/kernel/head64-xen.c
+++ b/arch/x86/kernel/head64-xen.c
@@ -178,11 +178,12 @@ asmlinkage __visible void __init x86_64_
 	/* Kill off the identity-map trampoline */
 	reset_early_page_tables();
 
-	kasan_map_early_shadow(early_level4_pgt);
-
-	/* clear bss before set_intr_gate with early_idt_handler */
 	clear_bss();
 
+	clear_page(init_level4_pgt);
+
+	kasan_early_init();
+
 	for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
 		set_intr_gate(i, early_idt_handler_array[i]);
 	load_idt((const struct desc_ptr *)&idt_descr);
@@ -196,15 +197,12 @@ asmlinkage __visible void __init x86_64_
 #endif
 
 #ifndef CONFIG_XEN
-	clear_page(init_level4_pgt);
 	/* set init_level4_pgt kernel high mapping*/
 	init_level4_pgt[511] = early_level4_pgt[511];
 
-	kasan_map_early_shadow(init_level4_pgt);
 #else
 	xen_switch_pt();
 #endif
-
 	x86_64_start_reservations(real_mode_data);
 }
 
--- a/arch/x86/kernel/head_32-xen.S
+++ b/arch/x86/kernel/head_32-xen.S
@@ -12,7 +12,6 @@
 #include <asm/thread_info.h>
 #include <asm/asm-offsets.h>
 #include <asm/boot.h>
-#include <asm/dwarf2.h>
 #include <asm/percpu.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/arch-x86/xen-mca.h>
@@ -119,15 +118,7 @@ ENTRY(startup_32)
 	.balign PAGE_SIZE
 #endif
 ENTRY(hypercall_page)
-	CFI_STARTPROC
-	.skip __HYPERVISOR_iret * 32
-	CFI_REMEMBER_STATE
-	.skip 1 /* push %eax */
-	CFI_ADJUST_CFA_OFFSET	8
-	CFI_REL_OFFSET	eax,0
-	CFI_RESTORE_STATE
-	.balign 0x1000,0
-	CFI_ENDPROC
+	.fill PAGE_SIZE, 1, 0xcc
 
 #define HYPERCALL(n) \
 	.equ HYPERVISOR_##n, hypercall_page + __HYPERVISOR_##n * 32; \
--- a/arch/x86/kernel/head_64-xen.S
+++ b/arch/x86/kernel/head_64-xen.S
@@ -19,7 +19,6 @@
 #include <asm/page.h>
 #include <asm/msr.h>
 #include <asm/cache.h>
-#include <asm/dwarf2.h>
 #include <asm/percpu.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/arch-x86/xen-mca.h>
@@ -95,39 +94,7 @@ NEXT_PAGE(hypercall_page)
 #if CONFIG_XEN_COMPAT <= 0x030002
 	phys_hypercall_page = . - .head.text
 #endif
-	CFI_STARTPROC
-	i = 0
-	.rept 0x1000 / 0x20
-	.skip 1 /* push %rcx */
-	CFI_ADJUST_CFA_OFFSET	8
-	CFI_REL_OFFSET	rcx,0
-	.skip 2 /* push %r11 */
-	CFI_ADJUST_CFA_OFFSET	8
-	CFI_REL_OFFSET	r11,0
-	.if i == __HYPERVISOR_iret
-	.skip 1 /* push %rax */
-	CFI_ADJUST_CFA_OFFSET	8
-	CFI_REL_OFFSET	rax,0
-	.endif
-	.skip 5 /* mov $#,%eax */
-	.skip 2 /* syscall */
-	.if i == __HYPERVISOR_iret
-	CFI_ADJUST_CFA_OFFSET	-3*8
-	CFI_SAME_VALUE	rax
-	CFI_SAME_VALUE	r11
-	CFI_SAME_VALUE	rcx
-	.else
-	.skip 2 /* pop %r11 */
-	CFI_ADJUST_CFA_OFFSET -8
-	CFI_RESTORE r11
-	.skip 1 /* pop %rcx */
-	CFI_ADJUST_CFA_OFFSET -8
-	CFI_RESTORE rcx
-	.endif
-	.balign 0x20,0 /* ret */
-	i = i + 1
-	.endr
-	CFI_ENDPROC
+	.fill PAGE_SIZE, 1, 0xcc
 
 #define HYPERCALL(n) \
 	.equ HYPERVISOR_##n, hypercall_page + __HYPERVISOR_##n * 32; \
--- a/arch/x86/kernel/irq-xen.c
+++ b/arch/x86/kernel/irq-xen.c
@@ -22,6 +22,12 @@
 #define CREATE_TRACE_POINTS
 #include <asm/trace/irq_vectors.h>
 
+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+EXPORT_PER_CPU_SYMBOL(irq_stat);
+
+DEFINE_PER_CPU(struct pt_regs *, irq_regs);
+EXPORT_PER_CPU_SYMBOL(irq_regs);
+
 atomic_t irq_err_count;
 
 #ifndef CONFIG_XEN
@@ -130,6 +136,12 @@ int arch_show_interrupts(struct seq_file
 		seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
 	seq_puts(p, "  Threshold APIC interrupts\n");
 #endif
+#ifdef CONFIG_X86_MCE_AMD
+	seq_printf(p, "%*s: ", prec, "DFR");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->irq_deferred_error_count);
+	seq_puts(p, "  Deferred Error APIC interrupts\n");
+#endif
 #ifdef CONFIG_X86_MCE
 	seq_printf(p, "%*s: ", prec, "MCE");
 	for_each_online_cpu(j)
@@ -150,6 +162,18 @@ int arch_show_interrupts(struct seq_file
 #if defined(CONFIG_X86_IO_APIC)
 	seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
 #endif
+#ifdef CONFIG_HAVE_KVM
+	seq_printf(p, "%*s: ", prec, "PIN");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis);
+	seq_puts(p, "  Posted-interrupt notification event\n");
+
+	seq_printf(p, "%*s: ", prec, "PIW");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ",
+			   irq_stats(j)->kvm_posted_intr_wakeup_ipis);
+	seq_puts(p, "  Posted-interrupt wakeup event\n");
+#endif
 	return 0;
 }
 
@@ -211,8 +235,7 @@ __visible unsigned int __irq_entry do_IR
 	unsigned vector = ~regs->orig_ax;
 	unsigned irq;
 
-	irq_enter();
-	exit_idle();
+	entering_irq();
 
 	irq = __this_cpu_read(vector_irq[vector]);
 
@@ -228,7 +251,7 @@ __visible unsigned int __irq_entry do_IR
 		}
 	}
 
-	irq_exit();
+	exiting_irq();
 
 	set_irq_regs(old_regs);
 	return 1;
@@ -256,6 +279,18 @@ __visible void smp_x86_platform_ipi(stru
 }
 
 #ifdef CONFIG_HAVE_KVM
+static void dummy_handler(void) {}
+static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler;
+
+void kvm_set_posted_intr_wakeup_handler(void (*handler)(void))
+{
+	if (handler)
+		kvm_posted_intr_wakeup_handler = handler;
+	else
+		kvm_posted_intr_wakeup_handler = dummy_handler;
+}
+EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler);
+
 /*
  * Handler for POSTED_INTERRUPT_VECTOR.
  */
@@ -263,16 +298,23 @@ __visible void smp_kvm_posted_intr_ipi(s
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 
-	ack_APIC_irq();
-
-	irq_enter();
-
-	exit_idle();
-
+	entering_ack_irq();
 	inc_irq_stat(kvm_posted_intr_ipis);
+	exiting_irq();
+	set_irq_regs(old_regs);
+}
 
-	irq_exit();
+/*
+ * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
+ */
+__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
 
+	entering_ack_irq();
+	inc_irq_stat(kvm_posted_intr_wakeup_ipis);
+	kvm_posted_intr_wakeup_handler();
+	exiting_irq();
 	set_irq_regs(old_regs);
 }
 #endif
@@ -327,14 +369,22 @@ int check_irq_vectors_for_cpu_disable(vo
 			if (!desc)
 				continue;
 
+			/*
+			 * Protect against concurrent action removal,
+			 * affinity changes etc.
+			 */
+			raw_spin_lock(&desc->lock);
 			data = irq_desc_get_irq_data(desc);
 			cpumask_copy(&affinity_new, data->affinity);
 			cpumask_clear_cpu(this_cpu, &affinity_new);
 
 			/* Do not count inactive or per-cpu irqs. */
-			if (!irq_has_action(irq) || irqd_is_per_cpu(data))
+			if (!irq_has_action(irq) || irqd_is_per_cpu(data)) {
+				raw_spin_unlock(&desc->lock);
 				continue;
+			}
 
+			raw_spin_unlock(&desc->lock);
 			/*
 			 * A single irq may be mapped to multiple
 			 * cpu's vector_irq[] (for example IOAPIC cluster
@@ -365,6 +415,9 @@ int check_irq_vectors_for_cpu_disable(vo
 		 * vector. If the vector is marked in the used vectors
 		 * bitmap or an irq is assigned to it, we don't count
 		 * it as available.
+		 *
+		 * As this is an inaccurate snapshot anyway, we can do
+		 * this w/o holding vector_lock.
 		 */
 		for (vector = FIRST_EXTERNAL_VECTOR;
 		     vector < first_system_vector; vector++) {
@@ -462,15 +515,20 @@ void fixup_irqs(void)
 	 */
 	mdelay(1);
 
+	/*
+	 * We can walk the vector array of this cpu without holding
+	 * vector_lock because the cpu is already marked !online, so
+	 * nothing else will touch it.
+	 */
 	for_each_irq_desc(irq, desc) {
 		if (!__test_and_clear_bit(irq, irqs_used))
 			continue;
 
 		if (xen_test_irq_pending(irq)) {
 			desc = irq_to_desc(irq);
+			raw_spin_lock(&desc->lock);
 			data = irq_desc_get_irq_data(desc);
 			chip = irq_data_get_irq_chip(data);
-			raw_spin_lock(&desc->lock);
 			if (chip->irq_retrigger)
 				chip->irq_retrigger(data);
 			raw_spin_unlock(&desc->lock);
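The posted-interrupt wakeup plumbing added above keeps the handler pointer permanently valid by swapping in a dummy function instead of NULL, so the IPI path can call it unconditionally. A minimal standalone sketch of that registration pattern (names are invented, not the kernel symbols):

#include <stdio.h>

static void dummy_handler(void) { }

/* Always points at a valid function; callers never test for NULL. */
static void (*wakeup_handler)(void) = dummy_handler;

static void set_wakeup_handler(void (*handler)(void))
{
	wakeup_handler = handler ? handler : dummy_handler;
}

static void real_handler(void)
{
	puts("wakeup!");
}

int main(void)
{
	wakeup_handler();		/* no-op, but safe           */
	set_wakeup_handler(real_handler);
	wakeup_handler();		/* prints "wakeup!"          */
	set_wakeup_handler(NULL);	/* falls back to the dummy   */
	wakeup_handler();
	return 0;
}
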
--- a/arch/x86/kernel/ldt-xen.c
+++ b/arch/x86/kernel/ldt-xen.c
@@ -12,6 +12,7 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
+#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
 
@@ -20,85 +21,83 @@
 #include <asm/mmu_context.h>
 #include <asm/syscalls.h>
 
-#ifdef CONFIG_SMP
+/* context.lock is held for us, so we don't need any locking. */
 static void flush_ldt(void *current_mm)
 {
-	if (current->active_mm == current_mm)
-		load_LDT(&current->active_mm->context);
+	mm_context_t *pc;
+
+	if (current->active_mm != current_mm)
+		return;
+
+	pc = &current->active_mm->context;
+	set_ldt(pc->ldt->entries, pc->ldt->size);
 }
-#endif
 
-static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
+/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
+static struct ldt_struct *alloc_ldt_struct(int size)
 {
-	void *oldldt, *newldt;
-	int oldsize;
+	struct ldt_struct *new_ldt;
+	int alloc_size;
 
-	if (mincount <= pc->size)
-		return 0;
-	oldsize = pc->size;
-	mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
-			(~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
-	if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
-		newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
-	else
-		newldt = (void *)__get_free_page(GFP_KERNEL);
+	if (size > LDT_ENTRIES)
+		return NULL;
 
-	if (!newldt)
-		return -ENOMEM;
+	new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL);
+	if (!new_ldt)
+		return NULL;
+
+	BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
+	alloc_size = size * LDT_ENTRY_SIZE;
+
+	/*
+	 * Xen is very picky: it requires a page-aligned LDT that has no
+	 * trailing nonzero bytes in any page that contains LDT descriptors.
+	 * Keep it simple: zero the whole allocation and never allocate less
+	 * than PAGE_SIZE.
+	 */
+	if (alloc_size > PAGE_SIZE)
+		new_ldt->entries = vzalloc(alloc_size);
+	else
+		new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL);
 
-	if (oldsize)
-		memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
-	oldldt = pc->ldt;
-	memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
-	       (mincount - oldsize) * LDT_ENTRY_SIZE);
-
-#ifdef CONFIG_X86_64
-	/* CHECKME: Do we really need this ? */
-	wmb();
-#endif
-	pc->ldt = newldt;
-	wmb();
-	pc->size = mincount;
-	wmb();
-
-	if (reload) {
-#ifdef CONFIG_SMP
-		preempt_disable();
-#endif
-		make_pages_readonly(newldt,
-				    (mincount * LDT_ENTRY_SIZE) / PAGE_SIZE,
-				    XENFEAT_writable_descriptor_tables);
-		load_LDT(pc);
-#ifdef CONFIG_SMP
-		if (!cpumask_equal(mm_cpumask(current->mm),
-				   cpumask_of(smp_processor_id())))
-			smp_call_function(flush_ldt, current->mm, 1);
-		preempt_enable();
-#endif
+	if (!new_ldt->entries) {
+		kfree(new_ldt);
+		return NULL;
 	}
-	if (oldsize) {
-		make_pages_writable(oldldt,
-				    (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
-				    XENFEAT_writable_descriptor_tables);
-		if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
-			vfree(oldldt);
-		else
-			put_page(virt_to_page(oldldt));
-	}
-	return 0;
-}
-
-static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
-{
-	int err = alloc_ldt(new, old->size, 0);
-
-	if (err < 0)
-		return err;
-	memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE);
-	make_pages_readonly(new->ldt,
-			    (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
+	new_ldt->size = size;
+	return new_ldt;
+}
+
+/* After calling this, the LDT is immutable. */
+static void finalize_ldt_struct(struct ldt_struct *ldt)
+{
+	make_pages_readonly(ldt->entries, PFN_DOWN(ldt->size * LDT_ENTRY_SIZE),
+			    XENFEAT_writable_descriptor_tables);
+}
+
+/* context.lock is held */
+static void install_ldt(struct mm_struct *current_mm,
+			struct ldt_struct *ldt)
+{
+	/* Synchronizes with lockless_dereference in load_mm_ldt. */
+	smp_store_release(&current_mm->context.ldt, ldt);
+
+	/* Activate the LDT for all CPUs using current_mm. */
+	on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true);
+}
+
+static void free_ldt_struct(struct ldt_struct *ldt)
+{
+	if (likely(!ldt))
+		return;
+
+	make_pages_writable(ldt->entries, PFN_DOWN(ldt->size * LDT_ENTRY_SIZE),
 			    XENFEAT_writable_descriptor_tables);
-	return 0;
+	if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+		vfree(ldt->entries);
+	else
+		free_page((unsigned long)ldt->entries);
+	kfree(ldt);
 }
 
 /*
@@ -107,19 +106,35 @@ static inline int copy_ldt(mm_context_t
  */
 int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 {
+	struct ldt_struct *new_ldt;
 	struct mm_struct *old_mm;
 	int retval = 0;
 
 	memset(&mm->context, 0, sizeof(mm->context));
 	mutex_init(&mm->context.lock);
 	old_mm = current->mm;
-	if (old_mm)
-		mm->context.vdso = old_mm->context.vdso;
-	if (old_mm && old_mm->context.size > 0) {
-		mutex_lock(&old_mm->context.lock);
-		retval = copy_ldt(&mm->context, &old_mm->context);
-		mutex_unlock(&old_mm->context.lock);
+	if (!old_mm)
+		return 0;
+	mm->context.vdso = old_mm->context.vdso;
+
+	mutex_lock(&old_mm->context.lock);
+	if (!old_mm->context.ldt)
+		goto out_unlock;
+
+	new_ldt = alloc_ldt_struct(old_mm->context.ldt->size);
+	if (!new_ldt) {
+		retval = -ENOMEM;
+		goto out_unlock;
 	}
+
+	memcpy(new_ldt->entries, old_mm->context.ldt->entries,
+	       new_ldt->size * LDT_ENTRY_SIZE);
+	finalize_ldt_struct(new_ldt);
+
+	mm->context.ldt = new_ldt;
+
+out_unlock:
+	mutex_unlock(&old_mm->context.lock);
 	return retval;
 }
 
@@ -130,53 +145,47 @@ int init_new_context(struct task_struct
  */
 void destroy_context(struct mm_struct *mm)
 {
-	if (mm->context.size) {
-		/* CHECKME: Can this ever happen ? */
-		if (mm == current->active_mm)
-			clear_LDT();
-		make_pages_writable(mm->context.ldt,
-				    (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
-				    XENFEAT_writable_descriptor_tables);
-		if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
-			vfree(mm->context.ldt);
-		else
-			put_page(virt_to_page(mm->context.ldt));
-		mm->context.size = 0;
-	}
+	free_ldt_struct(mm->context.ldt);
+	mm->context.ldt = NULL;
 }
 
 static int read_ldt(void __user *ptr, unsigned long bytecount)
 {
-	int err;
+	int retval;
 	unsigned long size;
 	struct mm_struct *mm = current->mm;
 
-	if (!mm->context.size)
-		return 0;
+	mutex_lock(&mm->context.lock);
+
+	if (!mm->context.ldt) {
+		retval = 0;
+		goto out_unlock;
+	}
+
 	if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
 		bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
 
-	mutex_lock(&mm->context.lock);
-	size = mm->context.size * LDT_ENTRY_SIZE;
+	size = mm->context.ldt->size * LDT_ENTRY_SIZE;
 	if (size > bytecount)
 		size = bytecount;
 
-	err = 0;
-	if (copy_to_user(ptr, mm->context.ldt, size))
-		err = -EFAULT;
-	mutex_unlock(&mm->context.lock);
-	if (err < 0)
-		goto error_return;
+	if (copy_to_user(ptr, mm->context.ldt->entries, size)) {
+		retval = -EFAULT;
+		goto out_unlock;
+	}
+
 	if (size != bytecount) {
-		/* zero-fill the rest */
-		if (clear_user(ptr + size, bytecount - size) != 0) {
-			err = -EFAULT;
-			goto error_return;
+		/* Zero-fill the rest and pretend we read bytecount bytes. */
+		if (clear_user(ptr + size, bytecount - size)) {
+			retval = -EFAULT;
+			goto out_unlock;
 		}
 	}
-	return bytecount;
-error_return:
-	return err;
+	retval = bytecount;
+
+out_unlock:
+	mutex_unlock(&mm->context.lock);
+	return retval;
 }
 
 static int read_default_ldt(void __user *ptr, unsigned long bytecount)
@@ -200,6 +209,8 @@ static int write_ldt(void __user *ptr, u
 	struct desc_struct ldt;
 	int error;
 	struct user_desc ldt_info;
+	int oldsize, newsize;
+	struct ldt_struct *new_ldt, *old_ldt;
 
 	error = -EINVAL;
 	if (bytecount != sizeof(ldt_info))
@@ -218,34 +229,40 @@ static int write_ldt(void __user *ptr, u
 			goto out;
 	}
 
-	mutex_lock(&mm->context.lock);
-	if (ldt_info.entry_number >= mm->context.size) {
-		error = alloc_ldt(&current->mm->context,
-				  ldt_info.entry_number + 1, 1);
-		if (error < 0)
-			goto out_unlock;
-	}
-
-	/* Allow LDTs to be cleared by the user. */
-	if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
-		if (oldmode || LDT_empty(&ldt_info)) {
-			memset(&ldt, 0, sizeof(ldt));
-			goto install;
+	if ((oldmode && !ldt_info.base_addr && !ldt_info.limit) ||
+	    LDT_empty(&ldt_info)) {
+		/* The user wants to clear the entry. */
+		memset(&ldt, 0, sizeof(ldt));
+	} else {
+		if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
+			error = -EINVAL;
+			goto out;
 		}
+
+		fill_ldt(&ldt, &ldt_info);
+		if (oldmode)
+			ldt.avl = 0;
 	}
 
-	if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
-		error = -EINVAL;
+	mutex_lock(&mm->context.lock);
+
+	old_ldt = mm->context.ldt;
+	oldsize = old_ldt ? old_ldt->size : 0;
+	newsize = max((int)(ldt_info.entry_number + 1), oldsize);
+
+	error = -ENOMEM;
+	new_ldt = alloc_ldt_struct(newsize);
+	if (!new_ldt)
 		goto out_unlock;
-	}
 
-	fill_ldt(&ldt, &ldt_info);
-	if (oldmode)
-		ldt.avl = 0;
-
-	/* Install the new entry ...  */
-install:
-	error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
+	if (old_ldt)
+		memcpy(new_ldt->entries, old_ldt->entries, oldsize * LDT_ENTRY_SIZE);
+	new_ldt->entries[ldt_info.entry_number] = ldt;
+	finalize_ldt_struct(new_ldt);
+
+	install_ldt(mm, new_ldt);
+	free_ldt_struct(old_ldt);
+	error = 0;
 
 out_unlock:
 	mutex_unlock(&mm->context.lock);
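The LDT rework above never edits a live table: write_ldt() builds a complete replacement, marks it read-only for Xen via finalize_ldt_struct(), and only then publishes it with smp_store_release(), so a racing load_mm_ldt() (which uses lockless_dereference()) never sees half-initialized entries. The standalone C11 sketch below models just that publish/consume pairing; the struct layout and sizes are invented:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct ldt_struct {
	int size;
	int entries[16];	/* stand-in for struct desc_struct[] */
};

static _Atomic(struct ldt_struct *) current_ldt;	/* mm->context.ldt */

/* Writer side: the whole table is filled in before it is published. */
static void install_ldt(struct ldt_struct *new_ldt)
{
	/* Models smp_store_release(): the entries become visible no
	 * later than the pointer itself. */
	atomic_store_explicit(&current_ldt, new_ldt, memory_order_release);
}

/* Reader side: models lockless_dereference() in load_mm_ldt(). */
static int read_entry(int idx)
{
	struct ldt_struct *ldt =
		atomic_load_explicit(&current_ldt, memory_order_consume);

	return ldt && idx < ldt->size ? ldt->entries[idx] : 0;
}

int main(void)
{
	struct ldt_struct *ldt = calloc(1, sizeof(*ldt));

	ldt->size = 1;
	ldt->entries[0] = 42;
	install_ldt(ldt);
	printf("%d\n", read_entry(0));	/* 42 */
	free(ldt);
	return 0;
}
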
--- a/arch/x86/kernel/mpparse-xen.c
+++ b/arch/x86/kernel/mpparse-xen.c
@@ -19,8 +19,8 @@
 #include <linux/module.h>
 #include <linux/smp.h>
 #include <linux/pci.h>
-#include <linux/irqdomain.h>
 
+#include <asm/irqdomain.h>
 #include <asm/mtrr.h>
 #include <asm/mpspec.h>
 #include <asm/pgalloc.h>
@@ -125,11 +125,6 @@ static void __init MP_bus_info(struct mp
 		pr_warn("Unknown bustype %s - ignoring\n", str);
 }
 
-static struct irq_domain_ops mp_ioapic_irqdomain_ops = {
-	.map = mp_irqdomain_map,
-	.unmap = mp_irqdomain_unmap,
-};
-
 static void __init MP_ioapic_info(struct mpc_ioapic *m)
 {
 	struct ioapic_domain_cfg cfg = {
--- a/arch/x86/kernel/pci-dma-xen.c
+++ b/arch/x86/kernel/pci-dma-xen.c
@@ -198,6 +198,51 @@ void dma_generic_free_coherent(struct de
 	}
 }
 
+void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
+		      gfp_t gfp, struct dma_attrs *attrs)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+	void *memory;
+
+	gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+
+	if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
+		return memory;
+
+	if (!dev)
+		dev = &x86_dma_fallback_dev;
+
+	if (!is_device_dma_capable(dev))
+		return NULL;
+
+	if (!ops->alloc)
+		return NULL;
+
+	memory = ops->alloc(dev, size, dma_handle,
+			    dma_alloc_coherent_gfp_flags(dev, gfp), attrs);
+	debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
+
+	return memory;
+}
+EXPORT_SYMBOL(dma_alloc_attrs);
+
+void dma_free_attrs(struct device *dev, size_t size,
+		    void *vaddr, dma_addr_t bus,
+		    struct dma_attrs *attrs)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+
+	WARN_ON(irqs_disabled());       /* for portability */
+
+	if (dma_release_from_coherent(dev, get_order(size), vaddr))
+		return;
+
+	debug_dma_free_coherent(dev, size, vaddr, bus);
+	if (ops->free)
+		ops->free(dev, size, vaddr, bus, attrs);
+}
+EXPORT_SYMBOL(dma_free_attrs);
+
 /*
  * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel
  * parameter documentation.
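dma_alloc_attrs()/dma_free_attrs() are open-coded here because the generic inline versions went away; the allocation path is essentially: try the per-device coherent pool, check DMA capability, then dispatch to the bus-specific ops->alloc. A stripped-down standalone model of that dispatch (the toy_* names are placeholders, and the coherent-pool and capability checks are only noted in a comment):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct toy_dma_ops {
	void *(*alloc)(size_t size, unsigned long long *dma_handle);
};

static void *toy_generic_alloc(size_t size, unsigned long long *dma_handle)
{
	void *p = malloc(size);

	*dma_handle = (unsigned long long)(uintptr_t)p;	/* fake bus address */
	return p;
}

static const struct toy_dma_ops toy_ops = { .alloc = toy_generic_alloc };

/* Same shape as the patched dma_alloc_attrs(): check, then dispatch.
 * A real implementation would first try dma_alloc_from_coherent() and
 * test is_device_dma_capable(); both are omitted in this sketch. */
static void *toy_dma_alloc(const struct toy_dma_ops *ops, size_t size,
			   unsigned long long *dma_handle)
{
	if (!ops || !ops->alloc)
		return NULL;
	return ops->alloc(size, dma_handle);
}

int main(void)
{
	unsigned long long handle;
	void *buf = toy_dma_alloc(&toy_ops, 4096, &handle);

	printf("%p %#llx\n", buf, handle);
	free(buf);
	return 0;
}
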
--- a/arch/x86/kernel/process-xen.c
+++ b/arch/x86/kernel/process-xen.c
@@ -25,8 +25,7 @@
 #include <asm/idle.h>
 #include <asm/uaccess.h>
 #include <asm/mwait.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
 #include <asm/debugreg.h>
 #include <asm/nmi.h>
 #include <asm/tlbflush.h>
@@ -60,6 +59,9 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(
 #endif
 };
 EXPORT_PER_CPU_SYMBOL(cpu_tss);
+#elif defined(CONFIG_X86_64_XEN)
+__visible DEFINE_PER_CPU(unsigned long, cpu_sp0) = TOP_OF_INIT_STACK;
+EXPORT_PER_CPU_SYMBOL(cpu_sp0);
 #endif
 
 #ifdef CONFIG_X86_64
@@ -79,47 +81,15 @@ void idle_notifier_unregister(struct not
 EXPORT_SYMBOL_GPL(idle_notifier_unregister);
 #endif
 
-struct kmem_cache *task_xstate_cachep;
-EXPORT_SYMBOL_GPL(task_xstate_cachep);
-
 /*
  * this gets called so that we can store lazy state into memory and copy the
  * current task into the new thread.
  */
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
-	*dst = *src;
-
-	dst->thread.fpu_counter = 0;
-	dst->thread.fpu.has_fpu = 0;
-	dst->thread.fpu.state = NULL;
-	task_disable_lazy_fpu_restore(dst);
-	if (tsk_used_math(src)) {
-		int err = fpu_alloc(&dst->thread.fpu);
-		if (err)
-			return err;
-		fpu_copy(dst, src);
-	}
-	return 0;
-}
-
-void free_thread_xstate(struct task_struct *tsk)
-{
-	fpu_free(&tsk->thread.fpu);
-}
-
-void arch_release_task_struct(struct task_struct *tsk)
-{
-	free_thread_xstate(tsk);
-}
+	memcpy(dst, src, arch_task_struct_size);
 
-void arch_task_cache_init(void)
-{
-        task_xstate_cachep =
-        	kmem_cache_create("task_xstate", xstate_size,
-				  __alignof__(union thread_xstate),
-				  SLAB_PANIC | SLAB_NOTRACK, NULL);
-	setup_xstate_comp();
+	return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
 }
 
 /*
@@ -130,6 +100,7 @@ void exit_thread(void)
 	struct task_struct *me = current;
 	struct thread_struct *t = &me->thread;
 	unsigned long *bp = t->io_bitmap_ptr;
+	struct fpu *fpu = &t->fpu;
 
 	if (bp) {
 		struct physdev_set_iobitmap set_iobitmap;
@@ -146,7 +117,7 @@ void exit_thread(void)
 		kfree(bp);
 	}
 
-	drop_fpu(me);
+	fpu__drop(fpu);
 }
 
 void flush_thread(void)
@@ -156,19 +127,7 @@ void flush_thread(void)
 	flush_ptrace_hw_breakpoint(tsk);
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
 
-	if (!use_eager_fpu()) {
-		/* FPU state will be reallocated lazily at the first use. */
-		drop_fpu(tsk);
-		free_thread_xstate(tsk);
-	} else {
-		if (!tsk_used_math(tsk)) {
-			/* kthread execs. TODO: cleanup this horror. */
-			if (WARN_ON(init_fpu(tsk)))
-				force_sig(SIGKILL, tsk);
-			user_fpu_begin();
-		}
-		restore_init_xstate();
-	}
+	fpu__clear(&tsk->thread.fpu);
 }
 
 static void hard_disable_TSC(void)
@@ -437,14 +396,14 @@ static int prefer_mwait_c1_over_halt(con
 }
 
 /*
- * MONITOR/MWAIT with no hints, used for default default C1 state.
- * This invokes MWAIT with interrutps enabled and no flags,
- * which is backwards compatible with the original MWAIT implementation.
+ * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT
+ * with interrupts enabled and no flags, which is backwards compatible with the
+ * original MWAIT implementation.
  */
-
 static void mwait_idle(void)
 {
 	if (!current_set_polling_and_test()) {
+		trace_cpu_idle_rcuidle(1, smp_processor_id());
 		if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
 			smp_mb(); /* quirk */
 			clflush((void *)&current_thread_info()->flags);
@@ -456,6 +415,7 @@ static void mwait_idle(void)
 			__sti_mwait(0, 0);
 		else
 			local_irq_enable();
+		trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
 	} else {
 		local_irq_enable();
 	}
--- a/arch/x86/kernel/process_32-xen.c
+++ b/arch/x86/kernel/process_32-xen.c
@@ -39,8 +39,7 @@
 #include <asm/pgtable.h>
 #include <asm/ldt.h>
 #include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
 #include <asm/desc.h>
 #ifdef CONFIG_MATH_EMULATION
 #include <asm/math_emu.h>
@@ -132,8 +131,8 @@ void release_thread(struct task_struct *
 	release_vm86_irqs(dead_task);
 }
 
-int copy_thread(unsigned long clone_flags, unsigned long sp,
-	unsigned long arg, struct task_struct *p)
+int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
+	unsigned long arg, struct task_struct *p, unsigned long tls)
 {
 	struct pt_regs *childregs = task_pt_regs(p);
 	struct task_struct *tsk;
@@ -192,7 +191,7 @@ int copy_thread(unsigned long clone_flag
 	 */
 	if (clone_flags & CLONE_SETTLS)
 		err = do_set_thread_area(p, -1,
-			(struct user_desc __user *)childregs->si, 0);
+			(struct user_desc __user *)tls, 0);
 
 	p->thread.iopl = current->thread.iopl;
 
@@ -250,12 +249,14 @@ __visible __notrace_funcgraph struct tas
 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
 	struct thread_struct *prev = &prev_p->thread,
-				 *next = &next_p->thread;
+			     *next = &next_p->thread;
+	struct fpu *prev_fpu = &prev->fpu;
+	struct fpu *next_fpu = &next->fpu;
 	int cpu = smp_processor_id(), cr0_ts;
 #ifndef CONFIG_X86_NO_TSS
 	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
 #endif
-	fpu_switch_t fpu;
+	fpu_switch_t fpu_switch;
 #if CONFIG_XEN_COMPAT > 0x030002
 	struct physdev_set_iopl iopl_op;
 	struct physdev_set_iobitmap iobmp_op;
@@ -268,7 +269,7 @@ __switch_to(struct task_struct *prev_p,
 
 	/* XEN NOTE: FS/GS saved in switch_mm(), not here. */
 
-	fpu = xen_switch_fpu_prepare(prev_p, next_p, cpu, &mcl);
+	fpu_switch = xen_switch_fpu_prepare(prev_fpu, next_fpu, cpu, &mcl);
 
 	/*
 	 * Reload sp0.
@@ -364,18 +365,15 @@ __switch_to(struct task_struct *prev_p,
 	 * Leave lazy mode, flushing any hypercalls made here.
 	 * This must be done before restoring TLS segments so
 	 * the GDT and LDT are properly updated, and must be
-	 * done before math_state_restore, so the TS bit is up
+	 * done before fpu__restore(), so the TS bit is up
 	 * to date.
 	 */
 	arch_end_context_switch(next_p);
 
 	/*
-	 * Reload kernel_stack and current_top_of_stack.  This changes
+	 * Reload cpu_current_top_of_stack.  This changes
 	 * current_thread_info().
 	 */
-	this_cpu_write(kernel_stack,
-		       (unsigned long)task_stack_page(next_p) +
-		       THREAD_SIZE);
 	this_cpu_write(cpu_current_top_of_stack,
 		       (unsigned long)task_stack_page(next_p) +
 		       THREAD_SIZE);
@@ -386,7 +384,7 @@ __switch_to(struct task_struct *prev_p,
 	if (prev->gs | next->gs)
 		lazy_load_gs(next->gs);
 
-	switch_fpu_finish(next_p, fpu);
+	switch_fpu_finish(next_fpu, fpu_switch);
 
 	this_cpu_write(current_task, next_p);
 
--- a/arch/x86/kernel/process_64-xen.c
+++ b/arch/x86/kernel/process_64-xen.c
@@ -41,8 +41,7 @@
 
 #include <asm/pgtable.h>
 #include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
 #include <asm/mmu_context.h>
 #include <asm/prctl.h>
 #include <xen/interface/physdev.h>
@@ -131,11 +130,11 @@ EXPORT_SYMBOL(xen_load_gs_index);
 void release_thread(struct task_struct *dead_task)
 {
 	if (dead_task->mm) {
-		if (dead_task->mm->context.size) {
+		if (dead_task->mm->context.ldt) {
 			pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
 				dead_task->comm,
 				dead_task->mm->context.ldt,
-				dead_task->mm->context.size);
+				dead_task->mm->context.ldt->size);
 			BUG();
 		}
 	}
@@ -160,8 +159,8 @@ static inline u32 read_32bit_tls(struct
 	return get_desc_base(&t->thread.tls_array[tls]);
 }
 
-int copy_thread(unsigned long clone_flags, unsigned long sp,
-		unsigned long arg, struct task_struct *p)
+int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
+		unsigned long arg, struct task_struct *p, unsigned long tls)
 {
 	int err;
 	struct pt_regs *childregs;
@@ -217,10 +216,10 @@ int copy_thread(unsigned long clone_flag
 #ifdef CONFIG_IA32_EMULATION
 		if (is_ia32_task())
 			err = do_set_thread_area(p, -1,
-				(struct user_desc __user *)childregs->si, 0);
+				(struct user_desc __user *)tls, 0);
 		else
 #endif
-			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
+			err = do_arch_prctl(p, ARCH_SET_FS, tls);
 		if (err)
 			goto out;
 	}
@@ -285,11 +284,13 @@ __switch_to(struct task_struct *prev_p,
 {
 	struct thread_struct *prev = &prev_p->thread;
 	struct thread_struct *next = &next_p->thread;
+	struct fpu *prev_fpu = &prev->fpu;
+	struct fpu *next_fpu = &next->fpu;
 	int cpu = smp_processor_id(), cr0_ts;
 #ifndef CONFIG_X86_NO_TSS
 	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
 #endif
-	fpu_switch_t fpu;
+	fpu_switch_t fpu_switch;
 #if CONFIG_XEN_COMPAT > 0x030002
 	struct physdev_set_iopl iopl_op;
 	struct physdev_set_iobitmap iobmp_op;
@@ -300,7 +301,7 @@ __switch_to(struct task_struct *prev_p,
 #endif
 	multicall_entry_t _mcl[8], *mcl = _mcl;
 
-	fpu = xen_switch_fpu_prepare(prev_p, next_p, cpu, &mcl);
+	fpu_switch = xen_switch_fpu_prepare(prev_fpu, next_fpu, cpu, &mcl);
 
 	/* Reload sp0. This is load_sp0(tss, next) with a multicall. */
 	mcl->op      = __HYPERVISOR_stack_switch;
@@ -399,7 +400,7 @@ __switch_to(struct task_struct *prev_p,
 	 * Leave lazy mode, flushing any hypercalls made here.  This
 	 * must be done after loading TLS entries in the GDT but before
 	 * loading segments that might reference them, and and it must
-	 * be done before math_state_restore, so the TS bit is up to
+	 * be done before fpu__restore(), so the TS bit is up to
 	 * date.
 	 */
 	arch_end_context_switch(next_p);
@@ -451,7 +452,7 @@ __switch_to(struct task_struct *prev_p,
 	if (next->gs)
 		WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs));
 
-	switch_fpu_finish(next_p, fpu);
+	switch_fpu_finish(next_fpu, fpu_switch);
 
 	/*
 	 * Switch the PDA context.
@@ -467,7 +468,7 @@ __switch_to(struct task_struct *prev_p,
 	this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
 
 	/* This changes current_thread_info(). */
-	this_cpu_write(kernel_stack,
+	this_cpu_write(cpu_sp0,
 		(unsigned long)task_stack_page(next_p) + THREAD_SIZE);
 
 	/*
--- a/arch/x86/kernel/setup-xen.c
+++ b/arch/x86/kernel/setup-xen.c
@@ -528,19 +528,18 @@ static void __init e820_reserve_setup_da
 #ifndef CONFIG_XEN
 	struct setup_data *data;
 	u64 pa_data;
-	int found = 0;
 
 	pa_data = boot_params.hdr.setup_data;
+	if (!pa_data)
+		return;
+
 	while (pa_data) {
 		data = early_memremap(pa_data, sizeof(*data));
 		e820_update_range(pa_data, sizeof(*data)+data->len,
 			 E820_RAM, E820_RESERVED_KERN);
-		found = 1;
 		pa_data = data->next;
 		early_memunmap(data, sizeof(*data));
 	}
-	if (!found)
-		return;
 
 	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
 	memcpy(&e820_saved, &e820, sizeof(struct e820map));
@@ -601,12 +600,14 @@ static void __init reserve_crashkernel_l
 	if (ret != 0) {
 		/*
 		 * two parts from lib/swiotlb.c:
-		 *	swiotlb size: user specified with swiotlb= or default.
-		 *	swiotlb overflow buffer: now is hardcoded to 32k.
-		 *		We round it to 8M for other buffers that
-		 *		may need to stay low too.
+		 * -swiotlb size: user-specified with swiotlb= or default.
+		 *
+		 * -swiotlb overflow buffer: now hardcoded to 32k. We round it
+		 * to 8M for other buffers that may need to stay low too. Also
+		 * make sure we allocate enough extra low memory so that we
+		 * don't run out of DMA buffers for 32-bit devices.
 		 */
-		low_size = swiotlb_size_or_default() + (8UL<<20);
+		low_size = max(swiotlb_size_or_default() + (8UL<<20), 256UL<<20);
 		auto_set = true;
 	} else {
 		/* passed with crashkernel=0,low ? */
@@ -912,7 +913,7 @@ dump_kernel_offset(struct notifier_block
 {
 	if (kaslr_enabled()) {
 		pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n",
-			 (unsigned long)&_text - __START_KERNEL,
+			 kaslr_offset(),
 			 __START_KERNEL,
 			 __START_KERNEL_map,
 			 MODULES_VADDR-1);
@@ -1248,6 +1249,9 @@ void __init setup_arch(char **cmdline_p)
 	memblock_set_current_limit(ISA_END_ADDRESS);
 	memblock_x86_fill();
 
+	if (efi_enabled(EFI_BOOT))
+		efi_find_mirror();
+
 	/*
 	 * The EFI specification says that boot service code won't be called
 	 * after ExitBootServices(). This is, in fact, a lie.
@@ -1458,8 +1462,7 @@ void __init setup_arch(char **cmdline_p)
 
 #ifndef CONFIG_XEN
 	init_apic_mappings();
-	if (x86_io_apic_ops.init)
-		x86_io_apic_ops.init();
+	io_apic_init_mappings();
 
 	kvm_guest_init();
 
--- a/arch/x86/kernel/traps-xen.c
+++ b/arch/x86/kernel/traps-xen.c
@@ -54,12 +54,13 @@
 #include <asm/ftrace.h>
 #include <asm/traps.h>
 #include <asm/desc.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
 #include <asm/mce.h>
 #include <asm/fixmap.h>
 #include <asm/mach_traps.h>
 #include <asm/alternative.h>
+#include <asm/fpu/xstate.h>
+#include <asm/trace/mpx.h>
 #include <asm/mpx.h>
 
 #ifdef CONFIG_X86_64
@@ -74,8 +75,7 @@ gate_desc debug_idt_table[NR_VECTORS] __
 #else
 #include <asm/processor-flags.h>
 #include <asm/setup.h>
-
-asmlinkage int system_call(void);
+#include <asm/proto.h>
 #endif
 
 #ifndef CONFIG_X86_NO_IDT
@@ -377,10 +377,8 @@ dotraplinkage void do_double_fault(struc
 
 dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 {
-	struct task_struct *tsk = current;
-	struct xsave_struct *xsave_buf;
 	enum ctx_state prev_state;
-	struct bndcsr *bndcsr;
+	const struct bndcsr *bndcsr;
 	siginfo_t *info;
 
 	prev_state = exception_enter();
@@ -399,15 +397,15 @@ dotraplinkage void do_bounds(struct pt_r
 
 	/*
 	 * We need to look at BNDSTATUS to resolve this exception.
-	 * It is not directly accessible, though, so we need to
-	 * do an xsave and then pull it out of the xsave buffer.
+	 * A NULL here might mean that it is in its 'init state',
+	 * which is all zeros which indicates MPX was not
+	 * responsible for the exception.
 	 */
-	fpu_save_init(&tsk->thread.fpu);
-	xsave_buf = &(tsk->thread.fpu.state->xsave);
-	bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR);
+	bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR);
 	if (!bndcsr)
 		goto exit_trap;
 
+	trace_bounds_exception_mpx(bndcsr);
 	/*
 	 * The error code field of the BNDSTATUS register communicates status
 	 * information of a bound range exception #BR or operation involving
@@ -415,11 +413,11 @@ dotraplinkage void do_bounds(struct pt_r
 	 */
 	switch (bndcsr->bndstatus & MPX_BNDSTA_ERROR_CODE) {
 	case 2:	/* Bound directory has invalid entry. */
-		if (mpx_handle_bd_fault(xsave_buf))
+		if (mpx_handle_bd_fault())
 			goto exit_trap;
 		break; /* Success, it was handled */
 	case 1: /* Bound violation. */
-		info = mpx_generate_siginfo(regs, xsave_buf);
+		info = mpx_generate_siginfo(regs);
 		if (IS_ERR(info)) {
 			/*
 			 * We failed to decode the MPX instruction.  Act as if
@@ -715,8 +713,8 @@ NOKPROBE_SYMBOL(do_debug);
 static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 {
 	struct task_struct *task = current;
+	struct fpu *fpu = &task->thread.fpu;
 	siginfo_t info;
-	unsigned short err;
 	char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
 						"simd exception";
 
@@ -724,8 +722,7 @@ static void math_error(struct pt_regs *r
 		return;
 	conditional_sti(regs);
 
-	if (!user_mode(regs))
-	{
+	if (!user_mode(regs)) {
 		if (!fixup_exception(regs)) {
 			task->thread.error_code = error_code;
 			task->thread.trap_nr = trapnr;
@@ -737,62 +734,20 @@ static void math_error(struct pt_regs *r
 	/*
 	 * Save the info for the exception handler and clear the error.
 	 */
-	unlazy_fpu(task);
-	task->thread.trap_nr = trapnr;
+	fpu__save(fpu);
+
+	task->thread.trap_nr	= trapnr;
 	task->thread.error_code = error_code;
-	info.si_signo = SIGFPE;
-	info.si_errno = 0;
-	info.si_addr = (void __user *)uprobe_get_trap_addr(regs);
-	if (trapnr == X86_TRAP_MF) {
-		unsigned short cwd, swd;
-		/*
-		 * (~cwd & swd) will mask out exceptions that are not set to unmasked
-		 * status.  0x3f is the exception bits in these regs, 0x200 is the
-		 * C1 reg you need in case of a stack fault, 0x040 is the stack
-		 * fault bit.  We should only be taking one exception at a time,
-		 * so if this combination doesn't produce any single exception,
-		 * then we have a bad program that isn't synchronizing its FPU usage
-		 * and it will suffer the consequences since we won't be able to
-		 * fully reproduce the context of the exception
-		 */
-		cwd = get_fpu_cwd(task);
-		swd = get_fpu_swd(task);
+	info.si_signo		= SIGFPE;
+	info.si_errno		= 0;
+	info.si_addr		= (void __user *)uprobe_get_trap_addr(regs);
 
-		err = swd & ~cwd;
-	} else {
-		/*
-		 * The SIMD FPU exceptions are handled a little differently, as there
-		 * is only a single status/control register.  Thus, to determine which
-		 * unmasked exception was caught we must mask the exception mask bits
-		 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
-		 */
-		unsigned short mxcsr = get_fpu_mxcsr(task);
-		err = ~(mxcsr >> 7) & mxcsr;
-	}
+	info.si_code = fpu__exception_code(fpu, trapnr);
 
-	if (err & 0x001) {	/* Invalid op */
-		/*
-		 * swd & 0x240 == 0x040: Stack Underflow
-		 * swd & 0x240 == 0x240: Stack Overflow
-		 * User must clear the SF bit (0x40) if set
-		 */
-		info.si_code = FPE_FLTINV;
-	} else if (err & 0x004) { /* Divide by Zero */
-		info.si_code = FPE_FLTDIV;
-	} else if (err & 0x008) { /* Overflow */
-		info.si_code = FPE_FLTOVF;
-	} else if (err & 0x012) { /* Denormal, Underflow */
-		info.si_code = FPE_FLTUND;
-	} else if (err & 0x020) { /* Precision */
-		info.si_code = FPE_FLTRES;
-	} else {
-		/*
-		 * If we're using IRQ 13, or supposedly even some trap
-		 * X86_TRAP_MF implementations, it's possible
-		 * we get a spurious trap, which is not an error.
-		 */
+	/* Retry when we get spurious exceptions: */
+	if (!info.si_code)
 		return;
-	}
+
 	force_sig_info(SIGFPE, &info, task);
 }
 
@@ -820,70 +775,9 @@ dotraplinkage void
 do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
 {
 	conditional_sti(regs);
-#if 0
-	/* No need to warn about this any longer. */
-	pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
-#endif
-}
-
-asmlinkage __visible void __attribute__((weak)) smp_thermal_interrupt(void)
-{
-}
-
-asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void)
-{
 }
 #endif /* CONFIG_XEN */
 
-/*
- * 'math_state_restore()' saves the current math information in the
- * old math state array, and gets the new ones from the current task
- *
- * Careful.. There are problems with IBM-designed IRQ13 behaviour.
- * Don't touch unless you *really* know how it works.
- *
- * Must be called with kernel preemption disabled (eg with local
- * local interrupts as in the case of do_device_not_available).
- */
-static void _math_state_restore(void)
-{
-	struct task_struct *tsk = current;
-
-	if (!tsk_used_math(tsk)) {
-		stts();
-		local_irq_enable();
-		/*
-		 * does a slab alloc which can sleep
-		 */
-		if (init_fpu(tsk)) {
-			/*
-			 * ran out of memory!
-			 */
-			do_group_exit(SIGKILL);
-			return;
-		}
-		local_irq_disable();
-		clts();
-	}
-
-	/* Avoid __kernel_fpu_begin() right after __thread_fpu_begin() */
-	kernel_fpu_disable();
-	xen_thread_fpu_begin(tsk, NULL);
-	if (unlikely(restore_fpu_checking(tsk))) {
-		fpu_reset_state(tsk);
-		force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
-	} else {
-		tsk->thread.fpu_counter++;
-	}
-	kernel_fpu_enable();
-}
-
-void math_state_restore(void)
-{
-	clts();
-	_math_state_restore();
-}
-
 dotraplinkage void
 do_device_not_available(struct pt_regs *regs, long error_code)
 {
@@ -906,7 +800,7 @@ do_device_not_available(struct pt_regs *
 #endif
 	/* NB. 'clts' is done for us by Xen during virtual trap. */
 	__this_cpu_and(xen_x86_cr0, ~X86_CR0_TS);
-	_math_state_restore(); /* interrupts still off */
+	fpu__restore(&current->thread.fpu); /* interrupts still off */
 #ifdef CONFIG_X86_32
 	conditional_sti(regs);
 #endif
@@ -978,9 +872,9 @@ static const trap_info_t trap_table[] =
 	{ X86_TRAP_XF, 0|X, __KERNEL_CS, (unsigned long)simd_coprocessor_error	},
 #ifdef CONFIG_X86_32
 	{ X86_TRAP_SPURIOUS, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment	},
-	{ SYSCALL_VECTOR,  3, __KERNEL_CS, (unsigned long)system_call	},
+	{ IA32_SYSCALL_VECTOR,  3, __KERNEL_CS, (unsigned long)entry_INT80_32	},
 #elif defined(CONFIG_IA32_EMULATION)
-	{ IA32_SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)ia32_syscall },
+	{ IA32_SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)entry_INT80_compat },
 #endif
 	{ }
 };
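The math_error() rework above delegates the x87/SIMD error decoding to fpu__exception_code(), which returns 0 for spurious exceptions so the handler can simply return. A standalone sketch of the x87 half of that decoding, mirroring the bit tests visible in the removed lines (the constants follow the FPE_* si_code numbering):

#include <stdio.h>

enum { FPE_NONE = 0, FPE_FLTDIV = 3, FPE_FLTOVF = 4,
       FPE_FLTUND = 5, FPE_FLTRES = 6, FPE_FLTINV = 7 };

/* Decode an x87 exception: unmasked = status & ~control (low 6 bits). */
static int x87_exception_code(unsigned short cwd, unsigned short swd)
{
	unsigned short err = swd & ~cwd & 0x3f;

	if (err & 0x001) return FPE_FLTINV;	/* invalid op              */
	if (err & 0x004) return FPE_FLTDIV;	/* divide by zero          */
	if (err & 0x008) return FPE_FLTOVF;	/* overflow                */
	if (err & 0x012) return FPE_FLTUND;	/* denormal, underflow     */
	if (err & 0x020) return FPE_FLTRES;	/* precision               */
	return FPE_NONE;			/* spurious: caller retries */
}

int main(void)
{
	printf("%d\n", x87_exception_code(0x037f, 0x0004)); /* masked -> 0 */
	printf("%d\n", x87_exception_code(0x0373, 0x0004)); /* FPE_FLTDIV  */
	return 0;
}
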
--- a/arch/x86/lib/cmpxchg16b_emu-xen.S
+++ b/arch/x86/lib/cmpxchg16b_emu-xen.S
@@ -6,7 +6,6 @@
  *
  */
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 #include <asm/irqflags.h>
 #include <asm/percpu.h>
 #include <xen/interface/xen.h>
@@ -23,7 +22,6 @@
  * %al  : Operation successful
  */
 ENTRY(this_cpu_cmpxchg16b_emu)
-	CFI_STARTPROC
 
 #
 # Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not
@@ -34,8 +32,7 @@ ENTRY(this_cpu_cmpxchg16b_emu)
 # *atomic* on a single cpu (as provided by the this_cpu_xx class of
 # macros).
 #
-	pushq_cfi %rbp
-	CFI_REL_OFFSET rbp, 0
+	pushq %rbp
 #ifndef __REG_si
 # error Out of sync with asm/irqflags.h!
 #endif
@@ -43,8 +40,7 @@ ENTRY(this_cpu_cmpxchg16b_emu)
 #define __REG_si %rbp
 #define esi ebp
 	GET_VCPU_INFO
-	pushq_cfi %rdi
-	CFI_REL_OFFSET rdi, 0
+	pushq %rdi
 	__SAVE_INTERRUPTS(edi)
 	__DISABLE_INTERRUPTS
 
@@ -61,31 +57,24 @@ ENTRY(this_cpu_cmpxchg16b_emu)
 
 	test %edi, %edi
 	jz .Lcheck_events
-	CFI_REMEMBER_STATE
 .Lexit:
-	popq_cfi %rdi
-	CFI_RESTORE rdi
-	popq_cfi %rbp
-	CFI_RESTORE rbp
+	popq %rdi
+	popq %rbp
 	ret
 
-	CFI_RESTORE_STATE
 .Lcheck_events:
 	__ENABLE_INTERRUPTS
 	__TEST_PENDING
 	jz .Lexit
 #undef __REG_si
 #undef esi
-	pushq_cfi %rsi
-	CFI_REL_OFFSET rsi, 0
-	pushq_cfi %rax
+	pushq %rsi
+	pushq %rax
 	/* %edi is zero already */
 	xor %esi, %esi
 	call hypercall_page + __HYPERVISOR_xen_version * 32
-	popq_cfi %rax
-	popq_cfi %rsi
-	CFI_RESTORE rsi
+	popq %rax
+	popq %rsi
 	jmp .Lexit
 
-	CFI_ENDPROC
 ENDPROC(this_cpu_cmpxchg16b_emu)
--- a/arch/x86/mm/fault-xen.c
+++ b/arch/x86/mm/fault-xen.c
@@ -13,6 +13,7 @@
 #include <linux/hugetlb.h>		/* hstate_index_to_shift	*/
 #include <linux/prefetch.h>		/* prefetchw			*/
 #include <linux/context_tracking.h>	/* exception_enter(), ...	*/
+#include <linux/uaccess.h>		/* faulthandler_disabled()	*/
 
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
 #include <asm/pgalloc.h>		/* pgd_*(), ...			*/
@@ -1152,9 +1153,9 @@ __do_page_fault(struct pt_regs *regs, un
 
 	/*
 	 * If we're in an interrupt, have no user context or are running
-	 * in an atomic region then we must not take the fault:
+	 * in a region with pagefaults disabled then we must not take the fault
 	 */
-	if (unlikely(in_atomic() || !mm)) {
+	if (unlikely(faulthandler_disabled() || !mm)) {
 		bad_area_nosemaphore(regs, error_code, address);
 		return;
 	}
--- a/arch/x86/mm/highmem_32-xen.c
+++ b/arch/x86/mm/highmem_32-xen.c
@@ -35,7 +35,7 @@ void *kmap_atomic_prot(struct page *page
 	unsigned long vaddr;
 	int idx, type;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 
 	if (!PageHighMem(page))
@@ -100,6 +100,7 @@ void __kunmap_atomic(void *kvaddr)
 #endif
 
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
 
--- a/arch/x86/mm/init_32-xen.c
+++ b/arch/x86/mm/init_32-xen.c
@@ -469,7 +469,7 @@ void __init add_highpages_with_active_re
 	phys_addr_t start, end;
 	u64 i;
 
-	for_each_free_mem_range(i, nid, &start, &end, NULL) {
+	for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL) {
 		unsigned long pfn = clamp_t(unsigned long, PFN_UP(start),
 					    start_pfn, end_pfn);
 		unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end),
--- a/arch/x86/mm/iomap_32-xen.c
+++ b/arch/x86/mm/iomap_32-xen.c
@@ -60,6 +60,7 @@ void *kmap_atomic_prot_pfn(unsigned long
 	unsigned long vaddr;
 	int idx, type;
 
+	preempt_disable();
 	pagefault_disable();
 
 	type = kmap_atomic_idx_push();
@@ -78,13 +79,13 @@ void __iomem *
 iomap_atomic_prot_pfn(unsigned long mfn, pgprot_t prot)
 {
 	/*
-	 * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
-	 * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
-	 * MTRR is UC or WC.  UC_MINUS gets the real intention, of the
-	 * user, which is "WC if the MTRR is WC, UC if you can't do that."
+	 * For non-PAT systems, translate non-WB request to UC- just in
+	 * case the caller set the PWT bit to prot directly without using
+	 * pgprot_writecombine(). UC- translates to uncached if the MTRR
+	 * is UC or WC. UC- gets the real intention of the user, which is
+	 * "WC if the MTRR is WC, UC if you can't do that."
 	 */
-	if (!pat_enabled && pgprot_val(prot) ==
-	    (__PAGE_KERNEL | cachemode2protval(_PAGE_CACHE_MODE_WC)))
+	if (!pat_enabled() && pgprot2cachemode(prot) != _PAGE_CACHE_MODE_WB)
 		prot = __pgprot(__PAGE_KERNEL |
 				cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS));
 
@@ -119,5 +120,6 @@ iounmap_atomic(void __iomem *kvaddr)
 	}
 
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(iounmap_atomic);
--- a/arch/x86/mm/ioremap-xen.c
+++ b/arch/x86/mm/ioremap-xen.c
@@ -185,6 +185,9 @@ static int ioremap_change_attr(unsigned
 	case _PAGE_CACHE_MODE_WC:
 		err = _set_memory_wc(vaddr, nrpages);
 		break;
+	case _PAGE_CACHE_MODE_WT:
+		err = _set_memory_wt(vaddr, nrpages);
+		break;
 	case _PAGE_CACHE_MODE_WB:
 		err = _set_memory_wb(vaddr, nrpages);
 		break;
@@ -212,6 +215,25 @@ int ioremap_check_change_attr(unsigned l
 	return rc;
 }
 
+static int __ioremap_check_ram(unsigned long start_mfn, unsigned long nr_pages,
+			       void *arg)
+{
+	domid_t *domid = arg;
+	unsigned long i;
+
+	for (i = 0; i < nr_pages; ++i) {
+		unsigned long pfn = mfn_to_local_pfn(start_mfn + i);
+
+		if (pfn_valid(pfn)) {
+			if (!PageReserved(pfn_to_page(pfn)))
+				return 1;
+			*domid = DOMID_SELF;
+		}
+	}
+
+	return 0;
+}
+
 /*
  * Remap an arbitrary physical address space into the kernel virtual
  * address space. It transparently creates kernel huge I/O mapping when
@@ -239,7 +261,6 @@ static void __iomem *__ioremap_caller(re
 	int retval;
 	domid_t domid = DOMID_IO;
 	void __iomem *ret_addr;
-	int ram_region;
 
 	/* Don't allow wraparound or zero size */
 	last_addr = phys_addr + size - 1;
@@ -262,29 +283,18 @@ static void __iomem *__ioremap_caller(re
 	/*
 	 * Don't allow anybody to remap normal RAM that we're using..
 	 */
-	/* First check if whole region can be identified as RAM or not */
-	ram_region = is_initial_xendomain() ? region_is_ram(phys_addr, size)
-					    : -1;
-	if (ram_region > 0) {
-		WARN_ONCE(1, "ioremap on RAM at %#Lx - %#Lx\n",
-			  (unsigned long long)phys_addr,
-			  (unsigned long long)last_addr);
+	mfn      = phys_addr >> PAGE_SHIFT;
+	last_mfn = last_addr >> PAGE_SHIFT;
+	if (is_initial_xendomain())
+		retval = walk_system_ram_range(mfn, last_mfn - mfn + 1, &domid,
+					       __ioremap_check_ram);
+	else
+		retval = __ioremap_check_ram(mfn, last_mfn - mfn + 1, &domid);
+	if (retval == 1) {
+		WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n",
+			  &phys_addr, &last_addr);
 		return NULL;
 	}
-
-	/* If could not be identified(-1), check page by page */
-	if (ram_region < 0) {
-		last_mfn = PFN_DOWN(last_addr);
-		for (mfn = PFN_DOWN(phys_addr); mfn <= last_mfn; mfn++) {
-			unsigned long pfn = mfn_to_local_pfn(mfn);
-
-			if (pfn_valid(pfn)) {
-				if (!PageReserved(pfn_to_page(pfn)))
-					return NULL;
-				domid = DOMID_SELF;
-			}
-		}
-	}
 	WARN_ON_ONCE(domid == DOMID_SELF);
 
 	/*
@@ -328,6 +338,10 @@ static void __iomem *__ioremap_caller(re
 		prot = __pgprot(pgprot_val(prot) |
 				cachemode2protval(_PAGE_CACHE_MODE_WC));
 		break;
+	case _PAGE_CACHE_MODE_WT:
+		prot = __pgprot(pgprot_val(prot) |
+				cachemode2protval(_PAGE_CACHE_MODE_WT));
+		break;
 	case _PAGE_CACHE_MODE_WB:
 		break;
 	}
@@ -391,10 +405,11 @@ void __iomem *ioremap_nocache(resource_s
 {
 	/*
 	 * Ideally, this should be:
-	 *	pat_enabled ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS;
+	 *	pat_enabled() ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS;
 	 *
 	 * Till we fix all X drivers to use ioremap_wc(), we will use
-	 * UC MINUS.
+	 * UC MINUS. Drivers that are certain they need or can already
+	 * be converted over to strong UC can use ioremap_uc().
 	 */
 	enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
 
@@ -404,6 +419,39 @@ void __iomem *ioremap_nocache(resource_s
 EXPORT_SYMBOL(ioremap_nocache);
 
 /**
+ * ioremap_uc     -   map bus memory into CPU space as strongly uncachable
+ * @phys_addr:    bus address of the memory
+ * @size:      size of the resource to map
+ *
+ * ioremap_uc performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * This version of ioremap ensures that the memory is marked with a strong
+ * preference as completely uncachable on the CPU when possible. For non-PAT
+ * systems this ends up setting page-attribute flags PCD=1, PWT=1. For PAT
+ * systems this will set the PAT entry for the pages as strong UC.  This call
+ * will honor existing caching rules from things like the PCI bus. Note that
+ * there are other caches and buffers on many busses. In particular driver
+ * authors should read up on PCI writes.
+ *
+ * It's useful if some control registers are in such an area and
+ * write combining or read caching is not desirable.
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size)
+{
+	enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
+
+	return __ioremap_caller(phys_addr, size, pcm,
+				__builtin_return_address(0));
+}
+EXPORT_SYMBOL_GPL(ioremap_uc);
+
+/**
  * ioremap_wc	-	map memory into CPU space write combined
  * @phys_addr:	bus address of the memory
  * @size:	size of the resource to map
@@ -415,14 +463,28 @@ EXPORT_SYMBOL(ioremap_nocache);
  */
 void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
 {
-	if (pat_enabled)
-		return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
+	return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
 					__builtin_return_address(0));
-	else
-		return ioremap_nocache(phys_addr, size);
 }
 EXPORT_SYMBOL(ioremap_wc);
 
+/**
+ * ioremap_wt	-	map memory into CPU space write through
+ * @phys_addr:	bus address of the memory
+ * @size:	size of the resource to map
+ *
+ * This version of ioremap ensures that the memory is marked write through.
+ * Write through stores data into memory while keeping the cache up-to-date.
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
+{
+	return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
+				__builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_wt);
+
 void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
 {
 	return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
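
A hypothetical driver fragment (not part of the patch) showing how the ioremap_uc()/ioremap_wt() variants exported above are meant to be used; the BAR address and size below are made-up examples.

#include <linux/io.h>

#define DEMO_BAR_PHYS	0xfe000000UL	/* assumption: some MMIO aperture */
#define DEMO_BAR_SIZE	0x10000UL

static void __iomem *demo_map_fb(void)
{
	/* Write-through: reads may hit the cache, writes still reach the device. */
	return ioremap_wt(DEMO_BAR_PHYS, DEMO_BAR_SIZE);	/* iounmap() to release */
}

static void __iomem *demo_map_regs(void)
{
	/* Strongly uncached, even where UC- could be overridden to WC by MTRRs. */
	return ioremap_uc(DEMO_BAR_PHYS + DEMO_BAR_SIZE, PAGE_SIZE);
}
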
@@ -488,7 +550,7 @@ void iounmap(volatile void __iomem *addr
 EXPORT_SYMBOL(iounmap);
 
 #ifndef CONFIG_XEN
-int arch_ioremap_pud_supported(void)
+int __init arch_ioremap_pud_supported(void)
 {
 #ifdef CONFIG_X86_64
 	return cpu_has_gbpages;
@@ -497,7 +559,7 @@ int arch_ioremap_pud_supported(void)
 #endif
 }
 
-int arch_ioremap_pmd_supported(void)
+int __init arch_ioremap_pmd_supported(void)
 {
 	return cpu_has_pse;
 }
@@ -510,18 +572,18 @@ void *xlate_dev_mem_ptr(phys_addr_t phys
 {
 	unsigned long start  = phys &  PAGE_MASK;
 	unsigned long offset = phys & ~PAGE_MASK;
-	unsigned long vaddr;
+	void *vaddr;
 
 	/* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
 	if (page_is_ram(start >> PAGE_SHIFT))
 		return __va(phys);
 
-	vaddr = (unsigned long)ioremap_cache(start, PAGE_SIZE);
+	vaddr = ioremap_cache(start, PAGE_SIZE);
 	/* Only add the offset on success and return NULL if the ioremap() failed: */
 	if (vaddr)
 		vaddr += offset;
 
-	return (void *)vaddr;
+	return vaddr;
 }
 
 void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
@@ -530,7 +592,6 @@ void unxlate_dev_mem_ptr(phys_addr_t phy
 		return;
 
 	iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
-	return;
 }
 #endif
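
The xlate/unxlate pair above backs /dev/mem; a simplified sketch of the calling pattern (modeled on drivers/char/mem.c, with the user-copy and range checks omitted; demo_peek_phys() is hypothetical):

#include <linux/errno.h>
#include <linux/io.h>
#include <linux/string.h>

static int demo_peek_phys(phys_addr_t phys, void *buf, size_t len)
{
	/* __va() for RAM, a temporary ioremap_cache() otherwise. */
	void *ptr = xlate_dev_mem_ptr(phys);

	if (!ptr)
		return -EFAULT;

	memcpy(buf, ptr, len);
	unxlate_dev_mem_ptr(phys, ptr);	/* iounmap()s only the non-RAM case */
	return 0;
}
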
 
--- a/arch/x86/mm/pageattr-xen.c
+++ b/arch/x86/mm/pageattr-xen.c
@@ -14,6 +14,7 @@
 #include <linux/percpu.h>
 #include <linux/gfp.h>
 #include <linux/pci.h>
+#include <linux/vmalloc.h>
 
 #include <asm/e820.h>
 #include <asm/processor.h>
@@ -129,16 +130,15 @@ within(unsigned long addr, unsigned long
  */
 void clflush_cache_range(void *vaddr, unsigned int size)
 {
-	void *vend = vaddr + size - 1;
+	unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1;
+	void *vend = vaddr + size;
+	void *p;
 
 	mb();
 
-	for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
-		clflushopt(vaddr);
-	/*
-	 * Flush any possible final partial cacheline:
-	 */
-	clflushopt(vend);
+	for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
+	     p < vend; p += boot_cpu_data.x86_clflush_size)
+		clflushopt(p);
 
 	mb();
 }
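
The rewrite rounds the start address down to a cache-line boundary, so the loop condition alone covers any trailing partial line and the old explicit clflushopt(vend) is no longer needed. A standalone worked example of the new bounds (plain C, assuming a 64-byte line size):

#include <stdio.h>

int main(void)
{
	unsigned long line = 64, mask = line - 1;
	unsigned long vaddr = 0x1010, size = 100;	/* unaligned 100-byte range */
	unsigned long vend = vaddr + size, p;		/* vend = 0x1074 */

	/* Prints 0x1000 and 0x1040: every line touched by [0x1010, 0x1074). */
	for (p = vaddr & ~mask; p < vend; p += line)
		printf("flush line at %#lx\n", p);

	return 0;
}
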
@@ -418,13 +418,11 @@ phys_addr_t slow_virt_to_phys(void *__vi
 	phys_addr_t phys_addr;
 	unsigned long offset;
 	enum pg_level level;
-	unsigned long psize;
 	unsigned long pmask;
 	pte_t *pte;
 
 	pte = lookup_address(virt_addr, &level);
 	BUG_ON(!pte);
-	psize = page_level_size(level);
 	pmask = page_level_mask(level);
 	offset = virt_addr & ~pmask;
 	phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
@@ -1559,6 +1557,9 @@ int _set_memory_uc(unsigned long addr, i
 {
 	/*
 	 * for now UC MINUS. see comments in ioremap_nocache()
+	 * If you really need strong UC use ioremap_uc(), but note
+	 * that you cannot override IO areas with set_memory_*() as
+	 * these helpers cannot work with IO memory.
 	 */
 	return change_page_attr_set(&addr, numpages,
 				    cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
@@ -1593,12 +1594,10 @@ EXPORT_SYMBOL(set_memory_uc);
 static int _set_memory_array(unsigned long *addr, int addrinarray,
 		enum page_cache_mode new_type)
 {
+	enum page_cache_mode set_type;
 	int i, j;
 	int ret;
 
-	/*
-	 * for now UC MINUS. see comments in ioremap_nocache()
-	 */
 	for (i = 0; i < addrinarray; i++) {
 		ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
 					new_type, NULL);
@@ -1606,9 +1605,12 @@ static int _set_memory_array(unsigned lo
 			goto out_free;
 	}
 
+	/* If WC, set to UC- first and then WC */
+	set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
+				_PAGE_CACHE_MODE_UC_MINUS : new_type;
+
 	ret = change_page_attr_set(addr, addrinarray,
-				   cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
-				   1);
+				   cachemode2pgprot(set_type), 1);
 
 	if (!ret && new_type == _PAGE_CACHE_MODE_WC)
 		ret = change_page_attr_set_clr(addr, addrinarray,
@@ -1640,6 +1642,12 @@ int set_memory_array_wc(unsigned long *a
 }
 EXPORT_SYMBOL(set_memory_array_wc);
 
+int set_memory_array_wt(unsigned long *addr, int addrinarray)
+{
+	return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WT);
+}
+EXPORT_SYMBOL_GPL(set_memory_array_wt);
+
 int _set_memory_wc(unsigned long addr, int numpages)
 {
 	int ret;
@@ -1662,27 +1670,42 @@ int set_memory_wc(unsigned long addr, in
 {
 	int ret;
 
-	if (!pat_enabled)
-		return set_memory_uc(addr, numpages);
-
 	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
 		_PAGE_CACHE_MODE_WC, NULL);
 	if (ret)
-		goto out_err;
+		return ret;
 
 	ret = _set_memory_wc(addr, numpages);
 	if (ret)
-		goto out_free;
-
-	return 0;
+		free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
 
-out_free:
-	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
-out_err:
 	return ret;
 }
 EXPORT_SYMBOL(set_memory_wc);
 
+int _set_memory_wt(unsigned long addr, int numpages)
+{
+	return change_page_attr_set(&addr, numpages,
+				    cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0);
+}
+
+int set_memory_wt(unsigned long addr, int numpages)
+{
+	int ret;
+
+	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+			      _PAGE_CACHE_MODE_WT, NULL);
+	if (ret)
+		return ret;
+
+	ret = _set_memory_wt(addr, numpages);
+	if (ret)
+		free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(set_memory_wt);
+
 int _set_memory_wb(unsigned long addr, int numpages)
 {
 	/* WB cache mode is hard wired to all cache attribute bits being 0 */
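
A hypothetical caller (not part of the patch) of the newly exported set_memory_wt(): make a page-aligned kernel buffer write-through for a while, then restore write-back. The buffer is assumed to come from __get_free_pages().

#include <asm/cacheflush.h>

static int demo_wt_window(unsigned long buf, int pages)	/* buf page aligned */
{
	int ret = set_memory_wt(buf, pages);	/* reserves the WT memtype, rewrites PTEs */

	if (ret)
		return ret;

	/* ... fill the buffer while stores are written through ... */

	return set_memory_wb(buf, pages);	/* drops the reservation again */
}
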
@@ -1773,6 +1796,7 @@ static int _set_pages_array(struct page
 {
 	unsigned long start;
 	unsigned long end;
+	enum page_cache_mode set_type;
 	int i;
 	int free_idx;
 	int ret;
@@ -1786,8 +1810,12 @@ static int _set_pages_array(struct page
 			goto err_out;
 	}
 
+	/* If WC, set to UC- first and then WC */
+	set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
+				_PAGE_CACHE_MODE_UC_MINUS : new_type;
+
 	ret = cpa_set_pages_array(pages, addrinarray,
-			cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS));
+				  cachemode2pgprot(set_type));
 	if (!ret && new_type == _PAGE_CACHE_MODE_WC)
 		ret = change_page_attr_set_clr(NULL, addrinarray,
 					       cachemode2pgprot(
@@ -1821,6 +1849,12 @@ int set_pages_array_wc(struct page **pag
 }
 EXPORT_SYMBOL(set_pages_array_wc);
 
+int set_pages_array_wt(struct page **pages, int addrinarray)
+{
+	return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WT);
+}
+EXPORT_SYMBOL_GPL(set_pages_array_wt);
+
 int set_pages_wb(struct page *page, int numpages)
 {
 	unsigned long addr = (unsigned long)page_address(page);
--- a/arch/x86/mm/pat-xen.c
+++ b/arch/x86/mm/pat-xen.c
@@ -33,13 +33,17 @@
 #include "pat_internal.h"
 #include "mm_internal.h"
 
-#ifdef CONFIG_X86_PAT
-int __read_mostly pat_enabled = 1;
+#undef pr_fmt
+#define pr_fmt(fmt) "" fmt
+
+static bool boot_cpu_done;
+
+static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT);
 
 static inline void pat_disable(const char *reason)
 {
-	pat_enabled = 0;
-	printk(KERN_INFO "%s\n", reason);
+	__pat_enabled = 0;
+	pr_info("x86/PAT: %s\n", reason);
 }
 
 static int __init nopat(char *str)
@@ -48,13 +52,12 @@ static int __init nopat(char *str)
 	return 0;
 }
 early_param("nopat", nopat);
-#else
-static inline void pat_disable(const char *reason)
+
+bool pat_enabled(void)
 {
-	(void)reason;
+	return !!__pat_enabled;
 }
-#endif
-
+EXPORT_SYMBOL_GPL(pat_enabled);
 
 int pat_debug_enable;
 
@@ -65,22 +68,24 @@ static int __init pat_debug_setup(char *
 }
 __setup("debugpat", pat_debug_setup);
 
-static u64 __read_mostly boot_pat_state;
-
 #ifdef CONFIG_X86_PAT
 /*
- * X86 PAT uses page flags WC and Uncached together to keep track of
- * memory type of pages that have backing page struct. X86 PAT supports 3
- * different memory types, _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC and
- * _PAGE_CACHE_MODE_UC_MINUS and fourth state where page's memory type has not
- * been changed from its default (value of -1 used to denote this).
- * Note we do not support _PAGE_CACHE_MODE_UC here.
+ * X86 PAT uses page flags arch_1 and uncached together to keep track of
+ * memory type of pages that have backing page struct.
+ *
+ * X86 PAT supports 4 different memory types:
+ *  - _PAGE_CACHE_MODE_WB
+ *  - _PAGE_CACHE_MODE_WC
+ *  - _PAGE_CACHE_MODE_UC_MINUS
+ *  - _PAGE_CACHE_MODE_WT
+ *
+ * _PAGE_CACHE_MODE_WB is the default type.
  */
 
-#define _PGMT_DEFAULT		0
+#define _PGMT_WB		0
 #define _PGMT_WC		(1UL << PG_arch_1)
 #define _PGMT_UC_MINUS		(1UL << PG_uncached)
-#define _PGMT_WB		(1UL << PG_uncached | 1UL << PG_arch_1)
+#define _PGMT_WT		(1UL << PG_uncached | 1UL << PG_arch_1)
 #define _PGMT_MASK		(1UL << PG_uncached | 1UL << PG_arch_1)
 #define _PGMT_CLEAR_MASK	(~_PGMT_MASK)
 
@@ -88,14 +93,14 @@ static inline enum page_cache_mode get_p
 {
 	unsigned long pg_flags = pg->flags & _PGMT_MASK;
 
-	if (pg_flags == _PGMT_DEFAULT)
-		return -1;
+	if (pg_flags == _PGMT_WB)
+		return _PAGE_CACHE_MODE_WB;
 	else if (pg_flags == _PGMT_WC)
 		return _PAGE_CACHE_MODE_WC;
 	else if (pg_flags == _PGMT_UC_MINUS)
 		return _PAGE_CACHE_MODE_UC_MINUS;
 	else
-		return _PAGE_CACHE_MODE_WB;
+		return _PAGE_CACHE_MODE_WT;
 }
 
 static inline void set_page_memtype(struct page *pg,
@@ -112,11 +117,12 @@ static inline void set_page_memtype(stru
 	case _PAGE_CACHE_MODE_UC_MINUS:
 		memtype_flags = _PGMT_UC_MINUS;
 		break;
-	case _PAGE_CACHE_MODE_WB:
-		memtype_flags = _PGMT_WB;
+	case _PAGE_CACHE_MODE_WT:
+		memtype_flags = _PGMT_WT;
 		break;
+	case _PAGE_CACHE_MODE_WB:
 	default:
-		memtype_flags = _PGMT_DEFAULT;
+		memtype_flags = _PGMT_WB;
 		break;
 	}
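
Spelled out, the two page flags used above encode the tracked memtype as a 2-bit code (PG_arch_1 low, PG_uncached high), matching the _PGMT_* macros in this hunk:

	PG_uncached  PG_arch_1   tracked type
	     0           0       _PAGE_CACHE_MODE_WB        (default)
	     0           1       _PAGE_CACHE_MODE_WC
	     1           0       _PAGE_CACHE_MODE_UC_MINUS
	     1           1       _PAGE_CACHE_MODE_WT
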
 
@@ -174,88 +180,162 @@ static enum page_cache_mode pat_get_cach
  * configuration.
  * Using lower indices is preferred, so we start with highest index.
  */
-void pat_init_cache_modes(void)
+void pat_init_cache_modes(u64 pat)
 {
-	int i;
 	enum page_cache_mode cache;
 	char pat_msg[33];
-	u64 pat;
+	int i;
 
-	rdmsrl(MSR_IA32_CR_PAT, pat);
 	pat_msg[32] = 0;
 	for (i = 7; i >= 0; i--) {
 		cache = pat_get_cache_mode((pat >> (i * 8)) & 7,
 					   pat_msg + 4 * i);
 		update_cache_mode_entry(i, cache);
 	}
-	pr_info("PAT configuration [0-7]: %s\n", pat_msg);
+	pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg);
 }
 
 #define PAT(x, y)	((u64)PAT_ ## y << ((x)*8))
 
-void pat_init(void)
+static void pat_bsp_init(u64 pat)
 {
-	u64 pat;
-	bool boot_cpu = !boot_pat_state;
+	u64 tmp_pat;
 
-	if (!pat_enabled)
+	if (!cpu_has_pat) {
+		pat_disable("PAT not supported by CPU.");
 		return;
+	}
 
-	if (!cpu_has_pat) {
-		if (!boot_pat_state) {
-			pat_disable("PAT not supported by CPU.");
-			return;
-		} else {
-			/*
-			 * If this happens we are on a secondary CPU, but
-			 * switched to PAT on the boot CPU. We have no way to
-			 * undo PAT.
-			 */
-			printk(KERN_ERR "PAT enabled, "
-			       "but not supported by secondary CPU\n");
-			BUG();
-		}
+	if (!pat_enabled())
+		goto done;
+
+	rdmsrl(MSR_IA32_CR_PAT, tmp_pat);
+	if (!tmp_pat) {
+		pat_disable("PAT MSR is 0, disabled.");
+		return;
 	}
 
 #ifndef CONFIG_XEN
-	/* Set PWT to Write-Combining. All other bits stay the same */
 	/*
-	 * PTE encoding used in Linux:
-	 *      PAT
-	 *      |PCD
-	 *      ||PWT
-	 *      |||
-	 *      000 WB		_PAGE_CACHE_WB
-	 *      001 WC		_PAGE_CACHE_WC
-	 *      010 UC-		_PAGE_CACHE_UC_MINUS
-	 *      011 UC		_PAGE_CACHE_UC
-	 * PAT bit unused
+	 * PAT settings are part of the hypervisor interface, and their
+	 * assignment cannot be changed.
 	 */
-	pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
-	      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
+	wrmsrl(MSR_IA32_CR_PAT, pat);
+#endif
 
-	/* Boot CPU check */
-	if (!boot_pat_state) {
-		rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);
-		if (!boot_pat_state) {
-			pat_disable("PAT read returns always zero, disabled.");
-			return;
-		}
+done:
+	pat_init_cache_modes(pat);
+}
+
+static void pat_ap_init(u64 pat)
+{
+	if (!pat_enabled())
+		return;
+
+	if (!cpu_has_pat) {
+		/*
+		 * If this happens we are on a secondary CPU, but switched to
+		 * PAT on the boot CPU. We have no way to undo PAT.
+		 */
+		panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n");
 	}
 
+#ifndef CONFIG_XEN
 	wrmsrl(MSR_IA32_CR_PAT, pat);
-#else
-	/*
-	 * PAT settings are part of the hypervisor interface, and their
-	 * assignment cannot be changed.
-	 */
-	rdmsrl(MSR_IA32_CR_PAT, pat);
-	if (!boot_pat_state)
-		boot_pat_state = pat;
 #endif
+}
+
+void pat_init(void)
+{
+	u64 pat;
+	struct cpuinfo_x86 *c = &boot_cpu_data;
+
+	if (!pat_enabled()) {
+		/*
+		 * No PAT. Emulate the PAT table that corresponds to the two
+		 * cache bits, PWT (Write Through) and PCD (Cache Disable). This
+		 * setup is the same as the BIOS default setup when the system
+		 * has PAT but the "nopat" boot option has been specified. This
+		 * emulated PAT table is used when MSR_IA32_CR_PAT returns 0.
+		 *
+		 * PTE encoding:
+		 *
+		 *       PCD
+		 *       |PWT  PAT
+		 *       ||    slot
+		 *       00    0    WB : _PAGE_CACHE_MODE_WB
+		 *       01    1    WT : _PAGE_CACHE_MODE_WT
+		 *       10    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
+		 *       11    3    UC : _PAGE_CACHE_MODE_UC
+		 *
+		 * NOTE: When WC or WP is used, it is redirected to UC- per
+		 * the default setup in __cachemode2pte_tbl[].
+		 */
+		pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) |
+		      PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC);
 
-	if (boot_cpu)
-		pat_init_cache_modes();
+	} else if ((c->x86_vendor == X86_VENDOR_INTEL) &&
+		   (((c->x86 == 0x6) && (c->x86_model <= 0xd)) ||
+		    ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) {
+		/*
+		 * PAT support with the lower four entries. Intel Pentium 2,
+		 * 3, M, and 4 are affected by PAT errata, which makes the
+		 * upper four entries unusable. To be on the safe side, we don't
+		 * use those.
+		 *
+		 *  PTE encoding:
+		 *      PAT
+		 *      |PCD
+		 *      ||PWT  PAT
+		 *      |||    slot
+		 *      000    0    WB : _PAGE_CACHE_MODE_WB
+		 *      001    1    WC : _PAGE_CACHE_MODE_WC
+		 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
+		 *      011    3    UC : _PAGE_CACHE_MODE_UC
+		 * PAT bit unused
+		 *
+		 * NOTE: When WT or WP is used, it is redirected to UC- per
+		 * the default setup in __cachemode2pte_tbl[].
+		 */
+		pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
+		      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
+	} else {
+		/*
+		 * Full PAT support.  We put WT in slot 7 to improve
+		 * robustness in the presence of errata that might cause
+		 * the high PAT bit to be ignored.  This way, a buggy slot 7
+		 * access will hit slot 3, and slot 3 is UC, so at worst
+		 * we lose performance without causing a correctness issue.
+		 * Pentium 4 erratum N46 is an example for such an erratum,
+		 * although we try not to use PAT at all on affected CPUs.
+		 *
+		 *  PTE encoding:
+		 *      PAT
+		 *      |PCD
+		 *      ||PWT  PAT
+		 *      |||    slot
+		 *      000    0    WB : _PAGE_CACHE_MODE_WB
+		 *      001    1    WC : _PAGE_CACHE_MODE_WC
+		 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
+		 *      011    3    UC : _PAGE_CACHE_MODE_UC
+		 *      100    4    WB : Reserved
+		 *      101    5    WC : Reserved
+		 *      110    6    UC-: Reserved
+		 *      111    7    WT : _PAGE_CACHE_MODE_WT
+		 *
+		 * The reserved slots are unused, but mapped to their
+		 * corresponding types in the presence of PAT errata.
+		 */
+		pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
+		      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT);
+	}
+
+	if (!boot_cpu_done) {
+		pat_bsp_init(pat);
+		boot_cpu_done = true;
+	} else {
+		pat_ap_init(pat);
+	}
 }
 
 #undef PAT
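
As a standalone illustration (not kernel code) of how the PAT(x, y) macro above composes the MSR value: the architectural memory-type encodings are restated below as an assumption, and the "full PAT support" layout with WT parked in slot 7 evaluates to 0x0407010600070106.

#include <stdio.h>

enum { UC = 0, WC = 1, WT = 4, WP = 5, WB = 6, UC_MINUS = 7 };
#define PAT(x, y) ((unsigned long long)(y) << ((x) * 8))

int main(void)
{
	unsigned long long pat =
		PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
		PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT);

	printf("MSR_IA32_CR_PAT = %#018llx\n", pat);	/* 0x0407010600070106 */
	return 0;
}
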
@@ -263,12 +343,17 @@ void pat_init(void)
 static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype accesses */
 
 static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end);
-static inline u8 _mtrr_type_lookup(u64 start, u64 end)
+static inline u8 _mtrr_type_lookup(u64 start, u64 end, u8 *uniform)
 {
+	int ret;
+
 	if (is_initial_xendomain())
-		return mtrr_type_lookup(start, end);
-	return pat_pagerange_is_ram(start, end) > 0
-	       ? MTRR_TYPE_WRCOMB : MTRR_TYPE_UNCACHABLE;
+		return mtrr_type_lookup(start, end, uniform);
+
+	ret = pat_pagerange_is_ram(start, end);
+	*uniform = ret >= 0;
+
+	return ret > 0 ? MTRR_TYPE_WRCOMB : MTRR_TYPE_UNCACHABLE;
 }
 #define mtrr_type_lookup _mtrr_type_lookup
 
@@ -287,9 +372,9 @@ static unsigned long pat_x_mtrr_type(u64
 	 * request is for WB.
 	 */
 	if (req_type == _PAGE_CACHE_MODE_WB) {
-		u8 mtrr_type;
+		u8 mtrr_type, uniform;
 
-		mtrr_type = mtrr_type_lookup(start, end);
+		mtrr_type = mtrr_type_lookup(start, end, &uniform);
 		if (mtrr_type != MTRR_TYPE_WRBACK)
 			return _PAGE_CACHE_MODE_UC_MINUS;
 
@@ -328,9 +413,14 @@ static int pat_pagerange_is_ram(resource
 
 /*
  * For RAM pages, we use page flags to mark the pages with appropriate type.
- * Here we do two pass:
- * - Find the memtype of all the pages in the range, look for any conflicts
- * - In case of no conflicts, set the new memtype for pages in the range
+ * The page flags are limited to four types, WB (default), WC, WT and UC-.
+ * WP request fails with -EINVAL, and UC gets redirected to UC-.  Setting
+ * a new memory type is only allowed for a page mapped with the default WB
+ * type.
+ *
+ * Here we do two passes:
+ * - Find the memtype of all the pages in the range, look for any conflicts.
+ * - In case of no conflicts, set the new memtype for pages in the range.
  */
 static int reserve_ram_pages_type(u64 start, u64 end,
 				  enum page_cache_mode req_type,
@@ -339,6 +429,12 @@ static int reserve_ram_pages_type(u64 st
 	struct page *page;
 	unsigned long mfn;
 
+	if (req_type == _PAGE_CACHE_MODE_WP) {
+		if (new_type)
+			*new_type = _PAGE_CACHE_MODE_UC_MINUS;
+		return -EINVAL;
+	}
+
 	if (req_type == _PAGE_CACHE_MODE_UC) {
 		/* We do not support strong UC */
 		WARN_ON_ONCE(1);
@@ -352,8 +448,8 @@ static int reserve_ram_pages_type(u64 st
 		BUG_ON(!pfn_valid(pfn));
 		page = pfn_to_page(pfn);
 		type = get_page_memtype(page);
-		if (type != -1) {
-			pr_info("reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
+		if (type != _PAGE_CACHE_MODE_WB) {
+			pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
 				start, end - 1, type, req_type);
 			if (new_type)
 				*new_type = type;
@@ -382,7 +478,7 @@ static int free_ram_pages_type(u64 start
 
 		BUG_ON(!pfn_valid(pfn));
 		page = pfn_to_page(pfn);
-		set_page_memtype(page, -1);
+		set_page_memtype(page, _PAGE_CACHE_MODE_WB);
 	}
 	return 0;
 }
@@ -393,6 +489,7 @@ static int free_ram_pages_type(u64 start
  * - _PAGE_CACHE_MODE_WC
  * - _PAGE_CACHE_MODE_UC_MINUS
  * - _PAGE_CACHE_MODE_UC
+ * - _PAGE_CACHE_MODE_WT
  *
  * If new_type is NULL, function will return an error if it cannot reserve the
  * region with req_type. If new_type is non-NULL, function will return
@@ -409,14 +506,10 @@ int reserve_memtype(u64 start, u64 end,
 
 	BUG_ON(start >= end); /* end is exclusive */
 
-	if (!pat_enabled) {
+	if (!pat_enabled()) {
 		/* This is identical to page table setting without PAT */
-		if (new_type) {
-			if (req_type == _PAGE_CACHE_MODE_WC)
-				*new_type = _PAGE_CACHE_MODE_UC_MINUS;
-			else
-				*new_type = req_type;
-		}
+		if (new_type)
+			*new_type = req_type;
 		return 0;
 	}
 
@@ -460,9 +553,9 @@ int reserve_memtype(u64 start, u64 end,
 
 	err = rbt_memtype_check_insert(new, new_type);
 	if (err) {
-		printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
-		       start, end - 1,
-		       cattr_name(new->type), cattr_name(req_type));
+		pr_info("x86/PAT: reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
+			start, end - 1,
+			cattr_name(new->type), cattr_name(req_type));
 		kfree(new);
 		spin_unlock(&memtype_lock);
 
@@ -484,7 +577,7 @@ int free_memtype(u64 start, u64 end)
 	int is_range_ram;
 	struct memtype *entry;
 
-	if (!pat_enabled)
+	if (!pat_enabled())
 		return 0;
 
 	/* Low ISA region is always mapped WB. No need to track */
@@ -506,8 +599,8 @@ int free_memtype(u64 start, u64 end)
 	spin_unlock(&memtype_lock);
 
 	if (!entry) {
-		printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
-		       current->comm, current->pid, start, end - 1);
+		pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
+			current->comm, current->pid, start, end - 1);
 		return -EINVAL;
 	}
 
@@ -527,7 +620,7 @@ int free_memtype(u64 start, u64 end)
  * Only to be called when PAT is enabled
  *
  * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS
- * or _PAGE_CACHE_MODE_UC
+ * or _PAGE_CACHE_MODE_WT.
  */
 static enum page_cache_mode lookup_memtype(u64 paddr)
 {
@@ -539,16 +632,9 @@ static enum page_cache_mode lookup_memty
 
 	if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
 		struct page *page;
-		page = pfn_to_page(paddr >> PAGE_SHIFT);
-		rettype = get_page_memtype(page);
-		/*
-		 * -1 from get_page_memtype() implies RAM page is in its
-		 * default state and not reserved, and hence of type WB
-		 */
-		if (rettype == -1)
-			rettype = _PAGE_CACHE_MODE_WB;
 
-		return rettype;
+		page = pfn_to_page(paddr >> PAGE_SHIFT);
+		return get_page_memtype(page);
 	}
 
 	spin_lock(&memtype_lock);
@@ -634,13 +720,13 @@ static inline int range_is_allowed(unsig
 	u64 to = from + size;
 	u64 cursor = from;
 
-	if (!pat_enabled)
+	if (!pat_enabled())
 		return 1;
 
 	while (cursor < to) {
 		if (!devmem_is_allowed(mfn)) {
-			printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
-			       current->comm, from, to - 1);
+			pr_info("x86/PAT: Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
+				current->comm, from, to - 1);
 			return 0;
 		}
 		cursor += PAGE_SIZE;
@@ -671,7 +757,7 @@ int phys_mem_access_prot_allowed(struct
 	 * caching for the high addresses through the KEN pin, but
 	 * we maintain the tradition of paranoia in this code.
 	 */
-	if (!pat_enabled &&
+	if (!pat_enabled() &&
 	    !(boot_cpu_has(X86_FEATURE_MTRR) ||
 	      boot_cpu_has(X86_FEATURE_K6_MTRR) ||
 	      boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
@@ -719,12 +805,12 @@ static int reserve_pfn_range(u64 paddr,
 	 * the type requested matches the type of first page in the range.
 	 */
 	if (is_ram) {
-		if (!pat_enabled)
+		if (!pat_enabled())
 			return 0;
 
 		pcm = lookup_memtype(paddr);
 		if (want_pcm != pcm) {
-			printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
+			pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
 				current->comm, current->pid,
 				cattr_name(want_pcm),
 				(unsigned long long)paddr,
@@ -745,13 +831,12 @@ static int reserve_pfn_range(u64 paddr,
 		if (strict_prot ||
 		    !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
 			free_memtype(paddr, paddr + size);
-			printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
-				" for [mem %#010Lx-%#010Lx], got %s\n",
-				current->comm, current->pid,
-				cattr_name(want_pcm),
-				(unsigned long long)paddr,
-				(unsigned long long)(paddr + size - 1),
-				cattr_name(pcm));
+			pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n",
+			       current->comm, current->pid,
+			       cattr_name(want_pcm),
+			       (unsigned long long)paddr,
+			       (unsigned long long)(paddr + size - 1),
+			       cattr_name(pcm));
 			return -EINVAL;
 		}
 		/*
@@ -834,7 +919,7 @@ int track_pfn_remap(struct vm_area_struc
 		return ret;
 	}
 
-	if (!pat_enabled)
+	if (!pat_enabled())
 		return 0;
 
 	/*
@@ -862,7 +947,7 @@ int track_pfn_insert(struct vm_area_stru
 {
 	enum page_cache_mode pcm;
 
-	if (!pat_enabled)
+	if (!pat_enabled())
 		return 0;
 
 	/* Set prot based on lookup */
@@ -904,14 +989,18 @@ void untrack_pfn(struct vm_area_struct *
 
 pgprot_t pgprot_writecombine(pgprot_t prot)
 {
-	if (pat_enabled)
-		return __pgprot(pgprot_val(prot) |
+	return __pgprot(pgprot_val(prot) |
 				cachemode2protval(_PAGE_CACHE_MODE_WC));
-	else
-		return pgprot_noncached(prot);
 }
 EXPORT_SYMBOL_GPL(pgprot_writecombine);
 
+pgprot_t pgprot_writethrough(pgprot_t prot)
+{
+	return __pgprot(pgprot_val(prot) |
+				cachemode2protval(_PAGE_CACHE_MODE_WT));
+}
+EXPORT_SYMBOL_GPL(pgprot_writethrough);
+
 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
 
 static struct memtype *memtype_get_idx(loff_t pos)
@@ -987,7 +1076,7 @@ static const struct file_operations memt
 
 static int __init pat_memtype_list_init(void)
 {
-	if (pat_enabled) {
+	if (pat_enabled()) {
 		debugfs_create_file("pat_memtype_list", S_IRUSR,
 				    arch_debugfs_dir, NULL, &memtype_fops);
 	}
--- a/arch/x86/mm/pgtable-xen.c
+++ b/arch/x86/mm/pgtable-xen.c
@@ -994,16 +994,31 @@ void xen_set_fixmap(enum fixed_addresses
 }
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+/**
+ * pud_set_huge - setup kernel PUD mapping
+ *
+ * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
+ * function sets up a huge page only if any of the following conditions are met:
+ *
+ * - MTRRs are disabled, or
+ *
+ * - MTRRs are enabled and the range is completely covered by a single MTRR, or
+ *
+ * - MTRRs are enabled and the corresponding MTRR memory type is WB, which
+ *   has no effect on the requested PAT memory type.
+ *
+ * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
+ * page mapping attempt fails.
+ *
+ * Returns 1 on success and 0 on failure.
+ */
 int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
 {
-	u8 mtrr;
+	u8 mtrr, uniform;
 
-	/*
-	 * Do not use a huge page when the range is covered by non-WB type
-	 * of MTRRs.
-	 */
-	mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE);
-	if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
+	mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
+	if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
+	    (mtrr != MTRR_TYPE_WRBACK))
 		return 0;
 
 	prot = pgprot_4k_2_large(prot);
@@ -1015,17 +1030,24 @@ int pud_set_huge(pud_t *pud, phys_addr_t
 	return 1;
 }
 
+/**
+ * pmd_set_huge - setup kernel PMD mapping
+ *
+ * See text over pud_set_huge() above.
+ *
+ * Returns 1 on success and 0 on failure.
+ */
 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
 {
-	u8 mtrr;
+	u8 mtrr, uniform;
 
-	/*
-	 * Do not use a huge page when the range is covered by non-WB type
-	 * of MTRRs.
-	 */
-	mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE);
-	if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
+	mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
+	if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
+	    (mtrr != MTRR_TYPE_WRBACK)) {
+		pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
+			     __func__, addr, addr + PMD_SIZE);
 		return 0;
+	}
 
 	prot = pgprot_4k_2_large(prot);
 
@@ -1036,6 +1058,11 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t
 	return 1;
 }
 
+/**
+ * pud_clear_huge - clear kernel PUD mapping when it is set
+ *
+ * Returns 1 on success and 0 on failure (no PUD map is found).
+ */
 int pud_clear_huge(pud_t *pud)
 {
 	if (pud_large(*pud)) {
@@ -1046,6 +1073,11 @@ int pud_clear_huge(pud_t *pud)
 	return 0;
 }
 
+/**
+ * pmd_clear_huge - clear kernel PMD mapping when it is set
+ *
+ * Returns 1 on success and 0 on failure (no PMD map is found).
+ */
 int pmd_clear_huge(pmd_t *pmd)
 {
 	if (pmd_large(*pmd)) {
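
A sketch (not part of the patch) of the decreasing-page-size retry the kernel-doc above asks callers to perform. Alignment checks and the page-table walk that produces pud/pmd are omitted; map_4k_range() is a hypothetical stand-in for the ordinary PTE-level path.

static int demo_map_range(pud_t *pud, pmd_t *pmd, phys_addr_t phys,
			  unsigned long size, pgprot_t prot)
{
	if (size >= PUD_SIZE && pud_set_huge(pud, phys, prot))
		return 0;				/* 1GB mapping */
	if (size >= PMD_SIZE && pmd_set_huge(pmd, phys, prot))
		return 0;				/* 2MB mapping */
	return map_4k_range(phys, size, prot);		/* hypothetical 4K fallback */
}
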
--- a/drivers/acpi/acpi_processor.c
+++ b/drivers/acpi/acpi_processor.c
@@ -285,11 +285,12 @@ static int acpi_processor_get_info(struc
 	 */
 	if (invalid_logical_cpuid(pr->id)) {
 		int ret = acpi_processor_hotadd_init(pr);
-		if (ret && (ret != -ENODEV || pr->phys_id == -1))
+		if (ret && (ret != -ENODEV || invalid_phys_cpuid(pr->phys_id)))
 			return ret;
 	}
 #if defined(CONFIG_SMP) && defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL)
-	if (pr->id >= setup_max_cpus && pr->id > 0)
+	if (pr->id >= setup_max_cpus && pr->id
+	    && !invalid_logical_cpuid(pr->id))
 		pr->id = -1;
 #endif
 
@@ -302,7 +303,7 @@ static int acpi_processor_get_info(struc
 	 * generated as the following format:
 	 * CPU+CPU ID.
 	 */
-	if (pr->id != -1)
+	if (!invalid_logical_cpuid(pr->id))
 		sprintf(acpi_device_bid(device), "CPU%X", pr->id);
 	else
 		snprintf(acpi_device_bid(device),
@@ -341,7 +342,7 @@ static int acpi_processor_get_info(struc
 	 * of /proc/cpuinfo
 	 */
 	status = acpi_evaluate_integer(pr->handle, "_SUN", NULL, &value);
-	if (ACPI_SUCCESS(status) && pr->id != -1)
+	if (ACPI_SUCCESS(status) && !invalid_logical_cpuid(pr->id))
 		arch_fix_phys_package_id(pr->id, value);
 
 	return 0;
@@ -385,14 +386,14 @@ static int acpi_processor_add(struct acp
 
 	result = acpi_processor_get_info(device);
 	if (result || /* Processor is not physically present or unavailable */
-	    ((pr->id == -1) && !processor_cntl_external()))
+	    (invalid_logical_cpuid(pr->id) && !processor_cntl_external()))
 		return 0;
 
 #ifdef CONFIG_SMP
 	if (pr->id >= setup_max_cpus && pr->id != 0) {
 		if (!processor_cntl_external())
 			return 0;
-		WARN_ON(pr->id != -1);
+		WARN_ON(!invalid_logical_cpuid(pr->id));
 	}
 #endif
 
@@ -441,7 +442,7 @@ static int acpi_processor_add(struct acp
 #ifndef CONFIG_XEN
 	per_cpu(processor_device_array, pr->id) = device;
 #else
-	if (pr->id != -1) {
+	if (!invalid_logical_cpuid(pr->id)) {
 #endif
 	per_cpu(processors, pr->id) = pr;
 
@@ -471,7 +472,7 @@ static int acpi_processor_add(struct acp
 	free_cpumask_var(pr->throttling.shared_cpu_map);
 	device->driver_data = NULL;
 #ifdef CONFIG_XEN
-	if (pr->id != -1)
+	if (!invalid_logical_cpuid(pr->id))
 #endif
 	per_cpu(processors, pr->id) = NULL;
  err_free_pr:
@@ -503,7 +504,7 @@ static void acpi_processor_remove(struct
 	 * Unbind the driver from the processor device and detach it from the
 	 * ACPI companion object.
 	 */
-	if (pr->id != -1) {
+	if (!invalid_logical_cpuid(pr->id)) {
 		device_release_driver(pr->dev);
 		acpi_unbind_one(pr->dev);
 	}
@@ -517,7 +518,7 @@ static void acpi_processor_remove(struct
 	mutex_lock(&processor_device_mutex);
 	radix_tree_delete(&processor_device_tree, pr->acpi_id);
 	mutex_unlock(&processor_device_mutex);
-	if (pr->id != -1)
+	if (!invalid_logical_cpuid(pr->id))
 		per_cpu(processors, pr->id) = NULL;
 	goto out;
 #else
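
The conversion above leans on two helpers introduced in 4.2; roughly, from include/linux/acpi.h (restated here for reference, so treat the exact bodies as an approximation):

static inline bool invalid_logical_cpuid(u32 cpuid)
{
	return (int)cpuid < 0;			/* catches -1 and any negative id */
}

static inline bool invalid_phys_cpuid(phys_cpuid_t phys_id)
{
	return phys_id == PHYS_CPUID_INVALID;
}
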
--- a/drivers/acpi/processor_driver.c
+++ b/drivers/acpi/processor_driver.c
@@ -168,12 +168,12 @@ static int acpi_pss_perf_init(struct acp
 	acpi_processor_ppc_has_changed(pr, 0);
 
 	/*
-	 * pr->id may equal to -1 while processor_cntl_external enabled.
+	 * pr->id may be invalid when processor_cntl_external is enabled.
 	 * throttle and thermal module don't support this case.
 	 * Tx only works when dom0 vcpu == pcpu num by far, as we give
 	 * control to dom0.
 	 */
-	if (pr->id != -1) {
+	if (!invalid_logical_cpuid(pr->id)) {
 		acpi_processor_get_throttling_info(pr);
 
 		if (pr->flags.throttling)
--- a/drivers/pci/msi-xen.c
+++ b/drivers/pci/msi-xen.c
@@ -58,27 +58,6 @@ struct msi_dev_list {
 
 /* Arch hooks */
 
-static void msi_set_enable(struct pci_dev *dev, int enable)
-{
-	u16 control;
-
-	pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &control);
-	control &= ~PCI_MSI_FLAGS_ENABLE;
-	if (enable)
-		control |= PCI_MSI_FLAGS_ENABLE;
-	pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, control);
-}
-
-static void msix_clear_and_set_ctrl(struct pci_dev *dev, u16 clear, u16 set)
-{
-	u16 ctrl;
-
-	pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &ctrl);
-	ctrl &= ~clear;
-	ctrl |= set;
-	pci_write_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, ctrl);
-}
-
 static int (*get_owner)(struct pci_dev *dev);
 
 static domid_t msi_get_dev_owner(struct pci_dev *dev)
@@ -326,11 +305,11 @@ void pci_restore_msi_state(struct pci_de
 
 	pci_intx_for_msi(dev, 0);
 	if (dev->msi_enabled)
-		msi_set_enable(dev, 0);
+		pci_msi_set_enable(dev, 0);
 	if (dev->msix_enabled)
-		msix_clear_and_set_ctrl(dev, 0,
-					PCI_MSIX_FLAGS_ENABLE |
-					PCI_MSIX_FLAGS_MASKALL);
+		pci_msix_clear_and_set_ctrl(dev, 0,
+					    PCI_MSIX_FLAGS_ENABLE |
+					    PCI_MSIX_FLAGS_MASKALL);
 
 	if (pci_seg_supported) {
 		struct physdev_pci_device restore = {
@@ -356,7 +335,7 @@ void pci_restore_msi_state(struct pci_de
 	WARN(rc && rc != -ENOSYS, "restore_msi -> %d\n", rc);
 
 	if (dev->msix_enabled)
-		msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0);
+		pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0);
 }
 EXPORT_SYMBOL_GPL(pci_restore_msi_state);
 
@@ -555,7 +534,7 @@ static int msi_capability_init(struct pc
 	struct msi_dev_list *dev_entry = get_msi_dev_pirq_list(dev);
 	int pirq;
 
-	msi_set_enable(dev, 0);	/* Disable MSI during set up */
+	pci_msi_set_enable(dev, 0);	/* Disable MSI during set up */
 
 	pirq = msi_map_vector(dev, nvec, 0, dev_entry->owner);
 	if (pirq < 0)
@@ -564,7 +543,7 @@ static int msi_capability_init(struct pc
 
 	/* Set MSI enabled bits	 */
 	pci_intx_for_msi(dev, 0);
-	msi_set_enable(dev, 1);
+	pci_msi_set_enable(dev, 1);
 	dev->msi_enabled = 1;
 
 	dev->irq = dev_entry->e.pirq = pirq;
@@ -594,7 +573,7 @@ static int msix_capability_init(struct p
 		return -ENOMEM;
 
 	/* Ensure MSI-X is disabled while it is set up */
-	msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
+	pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
 
 	table_base = find_table_base(dev);
 	if (!table_base)
@@ -605,7 +584,7 @@ static int msix_capability_init(struct p
 	 * MSI-X registers.  We need to mask all the vectors to prevent
 	 * interrupts coming in before they're fully set up.
 	 */
-	msix_clear_and_set_ctrl(dev, 0,
+	pci_msix_clear_and_set_ctrl(dev, 0,
 				PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE);
 
 	for (i = 0; i < nvec; i++) {
@@ -654,7 +633,7 @@ static int msix_capability_init(struct p
 	dev->msix_enabled = 1;
 	populate_msi_sysfs(dev);
 
-	msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0);
+	pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0);
 
 	return 0;
 }
@@ -755,7 +734,7 @@ void pci_msi_shutdown(struct pci_dev *de
 	msi_dev_entry->owner = DOMID_IO;
 
 	/* Disable MSI mode */
-	msi_set_enable(dev, 0);
+	pci_msi_set_enable(dev, 0);
 	pci_intx_for_msi(dev, 1);
 	dev->msi_enabled = 0;
 }
@@ -928,7 +907,7 @@ void pci_msix_shutdown(struct pci_dev *d
 
 	/* Disable MSI mode */
 	if (is_initial_xendomain()) {
-		msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
+		pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
 		pci_intx_for_msi(dev, 1);
 	}
 	dev->msix_enabled = 0;
@@ -960,26 +939,6 @@ EXPORT_SYMBOL(pci_msi_enabled);
 void pci_msi_init_pci_dev(struct pci_dev *dev)
 {
 	INIT_LIST_HEAD(&dev->msi_list);
-
-	/* Disable the msi hardware to avoid screaming interrupts
-	 * during boot.  This is the power on reset default so
-	 * usually this should be a noop.
-	 * But on a Xen host don't do this for
-	 * - IOMMUs which the hypervisor is in control of (and hence has
-	 *   already enabled on purpose),
-	 * - unprivileged domains.
-	 */
-	if (is_initial_xendomain()
-	    && (dev->class >> 8) == PCI_CLASS_SYSTEM_IOMMU
-	    && dev->vendor == PCI_VENDOR_ID_AMD)
-		return;
-	dev->msi_cap = pci_find_capability(dev, PCI_CAP_ID_MSI);
-	if (dev->msi_cap && is_initial_xendomain())
-		msi_set_enable(dev, 0);
-
-	dev->msix_cap = pci_find_capability(dev, PCI_CAP_ID_MSIX);
-	if (dev->msix_cap && is_initial_xendomain())
-		msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
 }
 
 /**
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -12,6 +12,7 @@
 #include <linux/module.h>
 #include <linux/cpumask.h>
 #include <linux/pci-aspm.h>
+#include <xen/xen.h>
 #include <asm-generic/pci-bridge.h>
 #include "pci.h"
 
@@ -1148,13 +1149,22 @@ void pci_msi_setup_pci_dev(struct pci_de
 	 * Disable the MSI hardware to avoid screaming interrupts
 	 * during boot.  This is the power on reset default so
 	 * usually this should be a noop.
+	 *
+	 * But on a Xen host don't do this for
+	 * - IOMMUs which the hypervisor is in control of (and hence has
+	 *   already enabled on purpose),
+	 * - unprivileged domains.
 	 */
+	if (xen_initial_domain()
+	    && (dev->class >> 8) == PCI_CLASS_SYSTEM_IOMMU
+	    && dev->vendor == PCI_VENDOR_ID_AMD)
+		return;
 	dev->msi_cap = pci_find_capability(dev, PCI_CAP_ID_MSI);
-	if (dev->msi_cap)
+	if (dev->msi_cap && xen_initial_domain())
 		pci_msi_set_enable(dev, 0);
 
 	dev->msix_cap = pci_find_capability(dev, PCI_CAP_ID_MSIX);
-	if (dev->msix_cap)
+	if (dev->msix_cap && xen_initial_domain())
 		pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
 }
 
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -327,11 +327,9 @@ config XEN_SMPBOOT
 
 config XEN_SPINLOCK_ACQUIRE_NESTING
 	int "maximum nesting level for acquiring spin locks"
-	depends on SMP
+	depends on SMP && !QUEUED_SPINLOCKS
 	# Xen versions prior to 3.2.x have a race condition with HYPERVISOR_poll().
-	depends on !XEN_COMPAT_030002_AND_LATER
-	depends on !XEN_COMPAT_030004_AND_LATER
-	depends on !XEN_COMPAT_030100_AND_LATER
+	depends on XEN_COMPAT >= 0x030200
 	range 0 3
 	default 0
 	help
--- a/drivers/xen/core/evtchn.c
+++ b/drivers/xen/core/evtchn.c
@@ -1261,10 +1261,82 @@ void irq_resume(void)
 }
 #endif
 
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+#include <asm/irqdomain.h>
+
+struct irq_domain *xen_irq_domain;
+
+static void xen_free_irqs(struct irq_domain *domain,
+			  unsigned int virq, unsigned int nr_irqs)
+{
+	unsigned int i;
+
+	/* XXX x86_vector_free_irqs() uses x86_vector_domain here. */
+	WARN_ON_ONCE(domain != xen_irq_domain);
+	for (i = 0; i < nr_irqs; i++) {
+		struct irq_data *data;
+
+		data = irq_domain_get_irq_data(domain, virq + i);
+		if (data && data->chip_data) {
+			if (virq + i >= ARRAY_SIZE(_irq_cfg))
+				kfree(data->chip_data);
+			irq_domain_reset_irq_data(data);
+		}
+	}
+}
+
+static int xen_alloc_irqs(struct irq_domain *domain, unsigned int virq,
+			  unsigned int nr_irqs, void *arg)
+{
+	unsigned int i;
+#ifdef CONFIG_X86
+	struct irq_alloc_info *info = arg;
+
+	/* Currently this allocator can't guarantee contiguous allocations. */
+	if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1)
+		return -ENOSYS;
+#endif
+
+	for (i = 0; i < nr_irqs; i++) {
+		struct irq_data *data;
+		struct irq_cfg *cfg;
+
+		data = irq_domain_get_irq_data(domain, virq + i);
+		BUG_ON(!data);
+
+		if (virq + i < ARRAY_SIZE(_irq_cfg))
+			cfg = _irq_cfg + virq + i;
+		else
+			cfg = kzalloc(sizeof(*cfg), GFP_KERNEL);
+		if (!cfg) {
+			xen_free_irqs(domain, virq, i);
+			return -ENOMEM;
+		}
+
+		data->chip_data = cfg;
+		data->hwirq = virq + i;
+	}
+
+	return 0;
+}
+
+static const struct irq_domain_ops xen_irq_domain_ops = {
+	.alloc	= xen_alloc_irqs,
+	.free	= xen_free_irqs,
+};
+#endif
+
 int __init arch_early_irq_init(void)
 {
 	unsigned int i;
 
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+	xen_irq_domain = irq_domain_add_tree(NULL, &xen_irq_domain_ops,
+						NULL);
+	BUG_ON(xen_irq_domain == NULL);
+	irq_set_default_host(xen_irq_domain);
+#endif
+
 	for (i = 0; i < ARRAY_SIZE(_irq_cfg); i++)
 		irq_set_chip_data(i, _irq_cfg + i);
 
@@ -1273,6 +1345,12 @@ int __init arch_early_irq_init(void)
 
 struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
 {
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+	struct irq_alloc_info info = {};
+	int res = __irq_domain_alloc_irqs(NULL, at, 1, node, &info, false);
+
+	return res >= 0 || res == -EEXIST ? irq_cfg(at) : NULL;
+#else
 	int res = irq_alloc_desc_at(at, node);
 	struct irq_cfg *cfg = NULL;
 
@@ -1299,7 +1377,8 @@ struct irq_cfg *alloc_irq_and_cfg_at(uns
 	return cfg;
 #else
 	return irq_cfg(at);
-#endif
+#endif /* CONFIG_SPARSE_IRQ */
+#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
 }
 
 #ifdef CONFIG_SPARSE_IRQ
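
Once xen_irq_domain is the default host, an interrupt can also be allocated and released through the generic hierarchy API; a sketch (not part of the patch; demo_alloc_one() is hypothetical):

static int demo_alloc_one(int node)
{
	struct irq_alloc_info info = {};	/* zeroed, as in alloc_irq_and_cfg_at() */
	int virq = irq_domain_alloc_irqs(xen_irq_domain, 1, node, &info);

	if (virq < 0)
		return virq;

	/* ... bind an event channel to virq, request_irq(), ... */

	irq_domain_free_irqs(virq, 1);
	return 0;
}
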
--- a/drivers/xen/core/gnttab.c
+++ b/drivers/xen/core/gnttab.c
@@ -37,6 +37,7 @@
 #include <linux/mm.h>
 #include <linux/seqlock.h>
 #include <linux/timer.h>
+#include <linux/vmalloc.h>
 #include <xen/interface/xen.h>
 #include <xen/gnttab.h>
 #include <asm/pgtable.h>
--- a/drivers/xen/core/smpboot.c
+++ b/drivers/xen/core/smpboot.c
@@ -16,6 +16,7 @@
 #include <linux/percpu.h>
 #include <linux/tick.h>
 #include <asm/desc.h>
+#include <asm/proto.h>
 #include <asm/pgalloc.h>
 #include <xen/evtchn.h>
 #include <xen/interface/vcpu.h>
@@ -27,7 +28,6 @@ extern void local_teardown_timer(unsigne
 
 extern void hypervisor_callback(void);
 extern void failsafe_callback(void);
-extern void system_call(void);
 extern void smp_trap_init(trap_info_t *);
 
 cpumask_var_t vcpu_initialized_mask;
@@ -246,7 +246,7 @@ static void cpu_initialize_context(unsig
 	ctxt.user_regs.fs = __KERNEL_PERCPU;
 	ctxt.user_regs.gs = __KERNEL_STACK_CANARY;
 #else /* __x86_64__ */
-	ctxt.syscall_callback_eip  = (unsigned long)system_call;
+	ctxt.syscall_callback_eip  = (unsigned long)entry_SYSCALL_64;
 
 	ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
 
@@ -390,12 +390,12 @@ int __cpu_up(unsigned int cpu, struct ta
 
 #ifdef CONFIG_X86_64
 	clear_tsk_thread_flag(idle, TIF_FORK);
+	per_cpu(cpu_sp0, cpu) = (unsigned long)task_stack_page(idle) +
+				THREAD_SIZE;
 #else
 	per_cpu(cpu_current_top_of_stack, cpu) =
 		(unsigned long)task_stack_page(idle) + THREAD_SIZE;
 #endif
-	per_cpu(kernel_stack, cpu) = (unsigned long)task_stack_page(idle) +
-				     THREAD_SIZE;
  	per_cpu(current_task, cpu) = idle;
 
 	cpu_initialize_context(cpu, idle->thread.sp0);
--- a/drivers/xen/pcifront/pci_op.c
+++ b/drivers/xen/pcifront/pci_op.c
@@ -5,10 +5,10 @@
  */
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/ktime.h>
 #include <linux/pci.h>
 #include <linux/spinlock.h>
 #include <asm/bitops.h>
-#include <linux/time.h>
 #include <xen/evtchn.h>
 #include "pcifront.h"
 
@@ -173,7 +173,6 @@ static int do_pci_op(struct pcifront_dev
 	unsigned long irq_flags;
 	evtchn_port_t port = pdev->evtchn;
 	s64 ns, ns_timeout;
-	struct timeval tv;
 
 	spin_lock_irqsave(&pdev->sh_info_lock, irq_flags);
 
@@ -190,8 +189,7 @@ static int do_pci_op(struct pcifront_dev
 	 * (in the latter case we end up continually re-executing poll() with a
 	 * timeout in the past). 1s difference gives plenty of slack for error.
 	 */
-	do_gettimeofday(&tv);
-	ns_timeout = timeval_to_ns(&tv) + 2 * (s64)NSEC_PER_SEC;
+	ns_timeout = ktime_get_ns() + 2 * (s64)NSEC_PER_SEC;
 
 	clear_evtchn(port);
 
@@ -200,8 +198,7 @@ static int do_pci_op(struct pcifront_dev
 		if (HYPERVISOR_poll(&port, 1, jiffies + 3*HZ))
 			BUG();
 		clear_evtchn(port);
-		do_gettimeofday(&tv);
-		ns = timeval_to_ns(&tv);
+		ns = ktime_get_ns();
 		if (ns > ns_timeout) {
 			dev_err(&pdev->xdev->dev,
 				"pciback not responding!!!\n");
@@ -434,9 +431,15 @@ int pcifront_scan_root(struct pcifront_d
 		       unsigned int domain, unsigned int bus)
 {
 	struct pci_bus *b;
+	LIST_HEAD(resources);
 	struct pcifront_sd *sd;
 	struct pci_bus_entry *bus_entry;
 	int err = 0;
+	static struct resource busn_res = {
+		.start = 0,
+		.end = 255,
+		.flags = IORESOURCE_BUS,
+	};
 
 #ifndef CONFIG_PCI_DOMAINS
 	if (domain != 0) {
@@ -457,14 +460,18 @@ int pcifront_scan_root(struct pcifront_d
 		err = -ENOMEM;
 		goto err_out;
 	}
+	pci_add_resource(&resources, &ioport_resource);
+	pci_add_resource(&resources, &iomem_resource);
+	pci_add_resource(&resources, &busn_res);
 	pcifront_init_sd(sd, domain, bus, pdev);
 
 	pci_lock_rescan_remove();
 
-	b = pci_scan_bus_parented(&pdev->xdev->dev, bus,
-				  &pcifront_bus_ops, sd);
+	b = pci_scan_root_bus(&pdev->xdev->dev, bus,
+			      &pcifront_bus_ops, sd, &resources);
 	if (!b) {
 		pci_unlock_rescan_remove();
+		pci_free_resource_list(&resources);
 		dev_err(&pdev->xdev->dev,
 			"Error creating PCI Frontend Bus!\n");
 		err = -ENOMEM;
--- a/drivers/xen/sfc_netutil/accel_util.c
+++ b/drivers/xen/sfc_netutil/accel_util.c
@@ -25,6 +25,7 @@
 #include <linux/slab.h>
 #include <linux/if_ether.h>
 #include <linux/module.h>
+#include <linux/vmalloc.h>
 #include <asm/io.h>
 #include <asm/pgtable.h>
 #include <asm/hypercall.h>
--- a/drivers/xen/xen-pciback/xenbus.c
+++ b/drivers/xen/xen-pciback/xenbus.c
@@ -15,6 +15,7 @@
 #include <xen/events.h>
 #include <asm/xen/pci.h>
 #else
+#include <linux/vmalloc.h>
 #include <xen/evtchn.h>
 #endif
 #include "pciback.h"
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -41,7 +41,7 @@
 #include <linux/vmalloc.h>
 #include <linux/export.h>
 #include <asm/xen/hypervisor.h>
-#include <asm/xen/page.h>
+#include <xen/page.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/event_channel.h>
 #include <xen/balloon.h>
@@ -411,16 +411,16 @@ int xenbus_grant_ring(struct xenbus_devi
 	int i, j;
 
 	for (i = 0; i < nr_pages; i++) {
-		unsigned long addr = (unsigned long)vaddr +
-			(PAGE_SIZE * i);
 		err = gnttab_grant_foreign_access(dev->otherend_id,
-						  virt_to_mfn(addr), 0);
+						  virt_to_mfn(vaddr), 0);
 		if (err < 0) {
 			xenbus_dev_fatal(dev, err,
 					 "granting access to ring page");
 			goto fail;
 		}
 		grefs[i] = err;
+
+		vaddr = vaddr + PAGE_SIZE;
 	}
 
 	return 0;
@@ -845,8 +845,10 @@ static int xenbus_unmap_ring_vfree_hvm(s
 
 	rv = xenbus_unmap_ring(dev, node->handles, node->nr_handles,
 			       addrs);
-	if (!rv)
+	if (!rv) {
 		vunmap(vaddr);
+		free_xenballooned_pages(node->nr_handles, node->hvm.pages);
+	}
 	else
 		WARN(1, "Leaking %p, size %u page(s)\n", vaddr,
 		     node->nr_handles);
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -1317,7 +1317,7 @@ static int xenbus_resume_cb(struct notif
 	int err = 0;
 
 	if (xen_hvm_domain()) {
-		uint64_t v;
+		uint64_t v = 0;
 
 		err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v);
 		if (!err && v)
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -167,6 +167,8 @@ struct msi_controller {
 			 struct msi_desc *desc);
 	void (*teardown_irq)(struct msi_controller *chip, unsigned int irq);
 };
+#else /* CONFIG_XEN */
+struct msi_msg;
 #endif /* CONFIG_XEN */
 
 #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
--- a/include/xen/evtchn.h
+++ b/include/xen/evtchn.h
@@ -61,6 +61,7 @@ struct irq_cfg {
 };
 struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node);
 static inline int evtchn_make_refcounted(unsigned int evtchn) { return 0; }
+extern struct irq_domain *xen_irq_domain;
 #endif
 
 /*
--- a/kernel/configs/xen.config
+++ b/kernel/configs/xen.config
@@ -18,26 +18,26 @@ CONFIG_TTY=y
 CONFIG_INET=y
 CONFIG_BINFMT_ELF=y
 # generic config
-CONFIG_XEN=y
+CONFIG_PARAVIRT_XEN=y
 CONFIG_XEN_DOM0=y
 # backend drivers
-CONFIG_XEN_BACKEND=y
-CONFIG_XEN_BLKDEV_BACKEND=m
-CONFIG_XEN_NETDEV_BACKEND=m
+CONFIG_PARAVIRT_XEN_BACKEND=y
+CONFIG_PARAVIRT_XEN_BLKDEV_BACKEND=m
+CONFIG_PARAVIRT_XEN_NETDEV_BACKEND=m
 CONFIG_HVC_XEN=y
 CONFIG_XEN_WDT=m
-CONFIG_XEN_SCSI_BACKEND=m
+CONFIG_PARAVIRT_XEN_SCSI_BACKEND=m
 # frontend drivers
 CONFIG_XEN_FBDEV_FRONTEND=m
 CONFIG_HVC_XEN_FRONTEND=y
 CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m
-CONFIG_XEN_SCSI_FRONTEND=m
+CONFIG_PARAVIRT_XEN_BLKDEV_FRONTEND=m
+CONFIG_PARAVIRT_XEN_NETDEV_FRONTEND=m
+CONFIG_PARAVIRT_XEN_SCSI_FRONTEND=m
 # others
 CONFIG_XEN_BALLOON=y
 CONFIG_XEN_SCRUB_PAGES=y
 CONFIG_XEN_DEV_EVTCHN=m
-CONFIG_XEN_BLKDEV_FRONTEND=m
-CONFIG_XEN_NETDEV_FRONTEND=m
 CONFIG_XENFS=m
 CONFIG_XEN_COMPAT_XENFS=y
 CONFIG_XEN_SYS_HYPERVISOR=y
--- a/lib/swiotlb-xen.c
+++ b/lib/swiotlb-xen.c
@@ -26,6 +26,7 @@
 #include <linux/iommu-helper.h>
 #include <linux/highmem.h>
 #include <linux/gfp.h>
+#include <linux/scatterlist.h>
 
 #include <asm/io.h>
 #include <asm/pci.h>