From: www.kernel.org
Subject: Linux 2.6.19
Patch-mainline: 2.6.19

Automatically created from "patches.kernel.org/patch-2.6.19" by xen-port-patches.py

Acked-by: jbeulich@novell.com

Index: head-2008-07-15/arch/x86/Kconfig
===================================================================
--- head-2008-07-15.orig/arch/x86/Kconfig	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/Kconfig	2008-07-15 14:24:17.000000000 +0200
@@ -389,6 +389,7 @@ config SCHED_NO_NO_OMIT_FRAME_POINTER
 
 menuconfig PARAVIRT_GUEST
 	bool "Paravirtualized guest support"
+	depends on !X86_XEN && !X86_64_XEN
 	help
 	  Say Y here to get to see options related to running Linux under
 	  various hypervisors.  This option alone does not add any kernel code.
Index: head-2008-07-15/arch/x86/kernel/apic_32-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/apic_32-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/apic_32-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -54,7 +54,6 @@ static cpumask_t timer_bcast_ipi;
 /*
  * Knob to control our willingness to enable the local APIC.
  */
-int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
 
 /*
  * Debug level
@@ -102,7 +101,7 @@ int get_physical_broadcast(void)
 
 #ifndef CONFIG_XEN
 #ifndef CONFIG_SMP
-static void up_apic_timer_interrupt_call(struct pt_regs *regs)
+static void up_apic_timer_interrupt_call(void)
 {
 	int cpu = smp_processor_id();
 
@@ -111,11 +110,11 @@ static void up_apic_timer_interrupt_call
 	 */
 	per_cpu(irq_stat, cpu).apic_timer_irqs++;
 
-	smp_local_timer_interrupt(regs);
+	smp_local_timer_interrupt();
 }
 #endif
 
-void smp_send_timer_broadcast_ipi(struct pt_regs *regs)
+void smp_send_timer_broadcast_ipi(void)
 {
 	cpumask_t mask;
 
@@ -128,7 +127,7 @@ void smp_send_timer_broadcast_ipi(struct
 		 * We can directly call the apic timer interrupt handler
 		 * in UP case. Minus all irq related functions
 		 */
-		up_apic_timer_interrupt_call(regs);
+		up_apic_timer_interrupt_call();
 #endif
 	}
 }
Index: head-2008-07-15/arch/x86/kernel/cpu/common-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/cpu/common-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/cpu/common-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -43,7 +43,7 @@ struct cpu_dev * cpu_devs[X86_VENDOR_NUM
 
 extern int disable_pse;
 
-static void default_init(struct cpuinfo_x86 * c)
+static void __cpuinit default_init(struct cpuinfo_x86 * c)
 {
 	/* Not much we can do here... */
 	/* Check if at least it has cpuid */
@@ -56,7 +56,7 @@ static void default_init(struct cpuinfo_
 	}
 }
 
-static struct cpu_dev default_cpu = {
+static struct cpu_dev __cpuinitdata default_cpu = {
 	.c_init	= default_init,
 	.c_vendor = "Unknown",
 };
@@ -191,7 +191,16 @@ static void __cpuinit get_cpu_vendor(str
 
 static int __init x86_fxsr_setup(char * s)
 {
+	/* Tell all the other CPUs to not use it... */
 	disable_x86_fxsr = 1;
+
+	/*
+	 * ... and clear the bits early in the boot_cpu_data
+	 * so that the bootup process doesn't try to do this
+	 * either.
+	 */
+	clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability);
+	clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability);
 	return 1;
 }
 __setup("nofxsr", x86_fxsr_setup);
@@ -272,7 +281,7 @@ static void __init early_cpu_detect(void
 	}
 }
 
-void __cpuinit generic_identify(struct cpuinfo_x86 * c)
+static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
 {
 	u32 tfms, xlvl;
 	int ebx;
@@ -698,8 +707,7 @@ old_gdt:
 	 */
 	atomic_inc(&init_mm.mm_count);
 	current->active_mm = &init_mm;
-	if (current->mm)
-		BUG();
+	BUG_ON(current->mm);
 	enter_lazy_tlb(&init_mm, current);
 
 	load_esp0(t, thread);
@@ -712,7 +720,7 @@ old_gdt:
 #endif
 
 	/* Clear %fs and %gs. */
-	asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
+	asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0));
 
 	/* Clear all 6 debug registers: */
 	set_debugreg(0, 0);
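
A quick note on the %fs/%gs hunk above: the rewritten asm passes the zero
as an input operand instead of zeroing %eax behind the compiler's back,
so gcc knows exactly which register is used. A minimal sketch of the new
form (the wrapper name is illustrative, not from the kernel):

    static inline void clear_fs_gs(void)
    {
        /* "r" (0) lets gcc materialize the zero in any register;
         * the old xorl form silently clobbered %eax. */
        asm volatile("movl %0, %%fs; movl %0, %%gs" : : "r" (0));
    }
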
Index: head-2008-07-15/arch/x86/kernel/entry_32-xen.S
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/entry_32-xen.S	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/entry_32-xen.S	2008-07-15 14:24:17.000000000 +0200
@@ -80,8 +80,12 @@ VM_MASK		= 0x00020000
 NMI_MASK	= 0x80000000
 
 #ifndef CONFIG_XEN
-#define DISABLE_INTERRUPTS	cli
-#define ENABLE_INTERRUPTS	sti
+/* These are replacements for paravirtualization */
+#define DISABLE_INTERRUPTS		cli
+#define ENABLE_INTERRUPTS		sti
+#define ENABLE_INTERRUPTS_SYSEXIT	sti; sysexit
+#define INTERRUPT_RETURN		iret
+#define GET_CR0_INTO_EAX		movl %cr0, %eax
 #else
 /* Offsets into shared_info_t. */
 #define evtchn_upcall_pending		/* 0 */
@@ -99,15 +103,29 @@ NMI_MASK	= 0x80000000
 
 #define __DISABLE_INTERRUPTS	movb $1,evtchn_upcall_mask(%esi)
 #define __ENABLE_INTERRUPTS	movb $0,evtchn_upcall_mask(%esi)
+#define __TEST_PENDING		testb $0xFF,evtchn_upcall_pending(%esi)
 #define DISABLE_INTERRUPTS	GET_VCPU_INFO				; \
 				__DISABLE_INTERRUPTS
 #define ENABLE_INTERRUPTS	GET_VCPU_INFO				; \
 				__ENABLE_INTERRUPTS
-#define __TEST_PENDING		testb $0xFF,evtchn_upcall_pending(%esi)
+#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS			; \
+sysexit_scrit:	/**** START OF SYSEXIT CRITICAL REGION ****/		; \
+	__TEST_PENDING							; \
+	jnz  14f		# process more events if necessary...	; \
+	movl ESI(%esp), %esi						; \
+	sysexit								; \
+14:	__DISABLE_INTERRUPTS						; \
+	TRACE_IRQS_OFF							; \
+sysexit_ecrit:	/**** END OF SYSEXIT CRITICAL REGION ****/		; \
+	push %esp							; \
+	call evtchn_do_upcall						; \
+	add  $4,%esp							; \
+	jmp  ret_from_intr
+#define INTERRUPT_RETURN	iret
 #endif
 
 #ifdef CONFIG_PREEMPT
-#define preempt_stop		cli; TRACE_IRQS_OFF
+#define preempt_stop		DISABLE_INTERRUPTS; TRACE_IRQS_OFF
 #else
 #define preempt_stop
 #define resume_kernel		restore_nocheck
@@ -206,18 +224,21 @@ NMI_MASK	= 0x80000000
 
 #define RING0_INT_FRAME \
 	CFI_STARTPROC simple;\
+	CFI_SIGNAL_FRAME;\
 	CFI_DEF_CFA esp, 3*4;\
 	/*CFI_OFFSET cs, -2*4;*/\
 	CFI_OFFSET eip, -3*4
 
 #define RING0_EC_FRAME \
 	CFI_STARTPROC simple;\
+	CFI_SIGNAL_FRAME;\
 	CFI_DEF_CFA esp, 4*4;\
 	/*CFI_OFFSET cs, -2*4;*/\
 	CFI_OFFSET eip, -3*4
 
 #define RING0_PTREGS_FRAME \
 	CFI_STARTPROC simple;\
+	CFI_SIGNAL_FRAME;\
 	CFI_DEF_CFA esp, OLDESP-EBX;\
 	/*CFI_OFFSET cs, CS-OLDESP;*/\
 	CFI_OFFSET eip, EIP-OLDESP;\
@@ -263,8 +284,9 @@ ret_from_intr:
 check_userspace:
 	movl EFLAGS(%esp), %eax		# mix EFLAGS and CS
 	movb CS(%esp), %al
-	testl $(VM_MASK | 2), %eax
-	jz resume_kernel
+	andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
+	cmpl $USER_RPL, %eax
+	jb resume_kernel		# not returning to v8086 or userspace
 ENTRY(resume_userspace)
 	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
@@ -277,7 +299,7 @@ ENTRY(resume_userspace)
 
 #ifdef CONFIG_PREEMPT
 ENTRY(resume_kernel)
-	cli
+	DISABLE_INTERRUPTS
 	cmpl $0,TI_preempt_count(%ebp)	# non-zero preempt_count ?
 	jnz restore_nocheck
 need_resched:
@@ -297,6 +319,7 @@ need_resched:
 	# sysenter call handler stub
 ENTRY(sysenter_entry)
 	CFI_STARTPROC simple
+	CFI_SIGNAL_FRAME
 	CFI_DEF_CFA esp, 0
 	CFI_REGISTER esp, ebp
 	movl SYSENTER_stack_esp0(%esp),%esp
@@ -305,7 +328,7 @@ sysenter_past_esp:
 	 * No need to follow this irqs on/off section: the syscall
 	 * disabled irqs and here we enable it straight after entry:
 	 */
-	sti
+	ENABLE_INTERRUPTS
 	pushl $(__USER_DS)
 	CFI_ADJUST_CFA_OFFSET 4
 	/*CFI_REL_OFFSET ss, 0*/
@@ -359,26 +382,8 @@ sysenter_past_esp:
 	movl EIP(%esp), %edx
 	movl OLDESP(%esp), %ecx
 	xorl %ebp,%ebp
-#ifdef CONFIG_XEN
 	TRACE_IRQS_ON
-	__ENABLE_INTERRUPTS
-sysexit_scrit:	/**** START OF SYSEXIT CRITICAL REGION ****/
-	__TEST_PENDING
-	jnz  14f			# process more events if necessary...
-	movl ESI(%esp), %esi
-	sysexit
-14:	__DISABLE_INTERRUPTS
-	TRACE_IRQS_OFF
-sysexit_ecrit:	/**** END OF SYSEXIT CRITICAL REGION ****/
-	push %esp
-	call evtchn_do_upcall
-	add  $4,%esp
-	jmp  ret_from_intr
-#else
-	TRACE_IRQS_ON
-	sti
-	sysexit
-#endif /* !CONFIG_XEN */
+	ENABLE_INTERRUPTS_SYSEXIT
 	CFI_ENDPROC
 
 	# pv sysenter call handler stub
@@ -444,8 +449,8 @@ restore_all:
 	# See comments in process.c:copy_thread() for details.
 	movb OLDSS(%esp), %ah
 	movb CS(%esp), %al
-	andl $(VM_MASK | (4 << 8) | 3), %eax
-	cmpl $((4 << 8) | 3), %eax
+	andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
+	cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
 	CFI_REMEMBER_STATE
 	je ldt_ss			# returning to user-space with LDT SS
 restore_nocheck:
@@ -467,12 +472,11 @@ restore_nocheck_notrace:
 	RESTORE_REGS
 	addl $4, %esp
 	CFI_ADJUST_CFA_OFFSET -4
-1:	iret
+1:	INTERRUPT_RETURN
 .section .fixup,"ax"
 iret_exc:
 #ifndef CONFIG_XEN
-	TRACE_IRQS_ON
-	sti
+	ENABLE_INTERRUPTS
 #endif
 	pushl $0			# no error code
 	pushl $do_iret_error
@@ -498,7 +502,7 @@ ldt_ss:
 	 * dosemu and wine happy. */
 	subl $8, %esp		# reserve space for switch16 pointer
 	CFI_ADJUST_CFA_OFFSET 8
-	cli
+	DISABLE_INTERRUPTS
 	TRACE_IRQS_OFF
 	movl %esp, %eax
 	/* Set up the 16bit stack frame with switch32 pointer on top,
@@ -508,7 +512,7 @@ ldt_ss:
 	TRACE_IRQS_IRET
 	RESTORE_REGS
 	lss 20+4(%esp), %esp	# switch to 16bit stack
-1:	iret
+1:	INTERRUPT_RETURN
 .section __ex_table,"a"
 	.align 4
 	.long 1b,iret_exc
@@ -524,7 +528,7 @@ scrit:	/**** START OF CRITICAL REGION **
 	RESTORE_REGS
 	addl $4, %esp
 	CFI_ADJUST_CFA_OFFSET -4
-1:	iret
+1:	INTERRUPT_RETURN
 .section __ex_table,"a"
 	.align 4
 	.long 1b,iret_exc
@@ -713,11 +717,9 @@ ENTRY(name)				\
 #define UNWIND_ESPFIX_STACK
 #endif
 
-ENTRY(divide_error)
-	RING0_INT_FRAME
-	pushl $0			# no error code
-	CFI_ADJUST_CFA_OFFSET 4
-	pushl $do_divide_error
+KPROBE_ENTRY(page_fault)
+	RING0_EC_FRAME
+	pushl $do_page_fault
 	CFI_ADJUST_CFA_OFFSET 4
 	ALIGN
 error_code:
@@ -767,6 +769,7 @@ error_code:
 	call *%edi
 	jmp ret_from_exception
 	CFI_ENDPROC
+KPROBE_END(page_fault)
 
 #ifdef CONFIG_XEN
 # A note on the "critical region" in our callback handler.
@@ -926,7 +929,7 @@ ENTRY(device_not_available)
 	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
 #ifndef CONFIG_XEN
-	movl %cr0, %eax
+	GET_CR0_INTO_EAX
 	testl $0x4, %eax		# EM (math emulation bit)
 	je device_available_emulate
 	pushl $0			# temporary storage for ORIG_EIP
@@ -961,9 +964,15 @@ device_available_emulate:
 	jne ok;					\
 label:						\
 	movl SYSENTER_stack_esp0+offset(%esp),%esp;	\
+	CFI_DEF_CFA esp, 0;			\
+	CFI_UNDEFINED eip;			\
 	pushfl;					\
+	CFI_ADJUST_CFA_OFFSET 4;		\
 	pushl $__KERNEL_CS;			\
-	pushl $sysenter_past_esp
+	CFI_ADJUST_CFA_OFFSET 4;		\
+	pushl $sysenter_past_esp;		\
+	CFI_ADJUST_CFA_OFFSET 4;		\
+	CFI_REL_OFFSET eip, 0
 #endif /* CONFIG_XEN */
 
 KPROBE_ENTRY(debug)
@@ -982,7 +991,8 @@ debug_stack_correct:
 	call do_debug
 	jmp ret_from_exception
 	CFI_ENDPROC
-	.previous .text
+KPROBE_END(debug)
+
 #ifndef CONFIG_XEN
 /*
  * NMI is doubly nasty. It can happen _while_ we're handling
@@ -992,7 +1002,7 @@ debug_stack_correct:
  * check whether we got an NMI on the debug path where the debug
  * fault happened on the sysenter path.
  */
-ENTRY(nmi)
+KPROBE_ENTRY(nmi)
 	RING0_INT_FRAME
 	pushl %eax
 	CFI_ADJUST_CFA_OFFSET 4
@@ -1017,6 +1027,7 @@ ENTRY(nmi)
 	cmpl $sysenter_entry,12(%esp)
 	je nmi_debug_stack_check
 nmi_stack_correct:
+	/* We have a RING0_INT_FRAME here */
 	pushl %eax
 	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
@@ -1027,9 +1038,12 @@ nmi_stack_correct:
 	CFI_ENDPROC
 
 nmi_stack_fixup:
+	RING0_INT_FRAME
 	FIX_STACK(12,nmi_stack_correct, 1)
 	jmp nmi_stack_correct
+
 nmi_debug_stack_check:
+	/* We have a RING0_INT_FRAME here */
 	cmpw $__KERNEL_CS,16(%esp)
 	jne nmi_stack_correct
 	cmpl $debug,(%esp)
@@ -1040,8 +1054,10 @@ nmi_debug_stack_check:
 	jmp nmi_stack_correct
 
 nmi_16bit_stack:
-	RING0_INT_FRAME
-	/* create the pointer to lss back */
+	/* We have a RING0_INT_FRAME here.
+	 *
+	 * create the pointer to lss back
+	 */
 	pushl %ss
 	CFI_ADJUST_CFA_OFFSET 4
 	pushl %esp
@@ -1062,14 +1078,14 @@ nmi_16bit_stack:
 	call do_nmi
 	RESTORE_REGS
 	lss 12+4(%esp), %esp		# back to 16bit stack
-1:	iret
+1:	INTERRUPT_RETURN
 	CFI_ENDPROC
 .section __ex_table,"a"
 	.align 4
 	.long 1b,iret_exc
 .previous
 #else
-ENTRY(nmi)
+KPROBE_ENTRY(nmi)
 	RING0_INT_FRAME
 	pushl %eax
 	CFI_ADJUST_CFA_OFFSET 4
@@ -1081,6 +1097,7 @@ ENTRY(nmi)
 	jmp restore_all
 	CFI_ENDPROC
 #endif
+KPROBE_END(nmi)
 
 KPROBE_ENTRY(int3)
 	RING0_INT_FRAME
@@ -1092,7 +1109,7 @@ KPROBE_ENTRY(int3)
 	call do_int3
 	jmp ret_from_exception
 	CFI_ENDPROC
-	.previous .text
+KPROBE_END(int3)
 
 ENTRY(overflow)
 	RING0_INT_FRAME
@@ -1157,7 +1174,7 @@ KPROBE_ENTRY(general_protection)
 	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
 	CFI_ENDPROC
-	.previous .text
+KPROBE_END(general_protection)
 
 ENTRY(alignment_check)
 	RING0_EC_FRAME
@@ -1166,13 +1183,14 @@ ENTRY(alignment_check)
 	jmp error_code
 	CFI_ENDPROC
 
-KPROBE_ENTRY(page_fault)
-	RING0_EC_FRAME
-	pushl $do_page_fault
+ENTRY(divide_error)
+	RING0_INT_FRAME
+	pushl $0			# no error code
+	CFI_ADJUST_CFA_OFFSET 4
+	pushl $do_divide_error
 	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
 	CFI_ENDPROC
-	.previous .text
 
 #ifdef CONFIG_X86_MCE
 ENTRY(machine_check)
@@ -1234,6 +1252,19 @@ ENTRY(fixup_4gb_segment)
 	jmp error_code
 	CFI_ENDPROC
 
+ENTRY(kernel_thread_helper)
+	pushl $0		# fake return address for unwinder
+	CFI_STARTPROC
+	movl %edx,%eax
+	push %edx
+	CFI_ADJUST_CFA_OFFSET 4
+	call *%ebx
+	push %eax
+	CFI_ADJUST_CFA_OFFSET 4
+	call do_exit
+	CFI_ENDPROC
+ENDPROC(kernel_thread_helper)
+
 .section .rodata,"a"
 #include "syscall_table.S"
 
Index: head-2008-07-15/arch/x86/kernel/head_32-xen.S
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/head_32-xen.S	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/head_32-xen.S	2008-07-15 14:24:17.000000000 +0200
@@ -62,7 +62,7 @@ ENTRY(startup_32)
 	movl %eax,%gs
 	cld			# gcc2 wants the direction flag cleared at all times
 
-	pushl %eax		# fake return address
+	pushl $0		# fake return address for unwinder
 	jmp start_kernel
 
 #define HYPERCALL_PAGE_OFFSET 0x1000
Index: head-2008-07-15/arch/x86/kernel/io_apic_32-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/io_apic_32-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/io_apic_32-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -31,6 +31,9 @@
 #include <linux/acpi.h>
 #include <linux/module.h>
 #include <linux/sysdev.h>
+#include <linux/pci.h>
+#include <linux/msi.h>
+#include <linux/htirq.h>
 
 #include <asm/io.h>
 #include <asm/smp.h>
@@ -38,13 +41,15 @@
 #include <asm/timer.h>
 #include <asm/i8259.h>
 #include <asm/nmi.h>
+#include <asm/msidef.h>
+#include <asm/hypertransport.h>
 
 #include <mach_apic.h>
+#include <mach_apicdef.h>
 
 #include "io_ports.h"
 
 #ifdef CONFIG_XEN
-
 #include <xen/interface/xen.h>
 #include <xen/interface/physdev.h>
 
@@ -55,32 +60,7 @@
 
 unsigned long io_apic_irqs;
 
-static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
-{
-	struct physdev_apic apic_op;
-	int ret;
-
-	apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
-	apic_op.reg = reg;
-	ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
-	if (ret)
-		return ret;
-	return apic_op.value;
-}
-
-static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
-{
-	struct physdev_apic apic_op;
-
-	apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
-	apic_op.reg = reg;
-	apic_op.value = value;
-	WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
-}
-
-#define io_apic_read(a,r)    xen_io_apic_read(a,r)
-#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
-
+#define clear_IO_APIC() ((void)0)
 #endif /* CONFIG_XEN */
 
 int (*ioapic_renumber_irq)(int ioapic, int irq);
@@ -105,7 +85,7 @@ int sis_apic_bug = -1;
  */
 int nr_ioapic_registers[MAX_IO_APICS];
 
-int disable_timer_pin_1 __initdata;
+static int disable_timer_pin_1 __initdata;
 
 /*
  * Rough estimation of how many shared IRQs there are, can
@@ -125,12 +105,122 @@ static struct irq_pin_list {
 	int apic, pin, next;
 } irq_2_pin[PIN_MAP_SIZE];
 
-int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
-#ifdef CONFIG_PCI_MSI
-#define vector_to_irq(vector) 	\
-	(platform_legacy_irq(vector) ? vector : vector_irq[vector])
+#ifndef CONFIG_XEN
+struct io_apic {
+	unsigned int index;
+	unsigned int unused[3];
+	unsigned int data;
+};
+
+static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
+{
+	return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
+		+ (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
+}
+#endif
+
+static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+{
+#ifndef CONFIG_XEN
+	struct io_apic __iomem *io_apic = io_apic_base(apic);
+	writel(reg, &io_apic->index);
+	return readl(&io_apic->data);
+#else
+	struct physdev_apic apic_op;
+	int ret;
+
+	apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
+	apic_op.reg = reg;
+	ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
+	if (ret)
+		return ret;
+	return apic_op.value;
+#endif
+}
+
+static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+{
+#ifndef CONFIG_XEN
+	struct io_apic __iomem *io_apic = io_apic_base(apic);
+	writel(reg, &io_apic->index);
+	writel(value, &io_apic->data);
+#else
+	struct physdev_apic apic_op;
+
+	apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
+	apic_op.reg = reg;
+	apic_op.value = value;
+	WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
+#endif
+}
+
+#ifndef CONFIG_XEN
+/*
+ * Re-write a value: to be used for read-modify-write
+ * cycles where the read already set up the index register.
+ *
+ * Older SiS APIC requires we rewrite the index register
+ */
+static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
+{
+	volatile struct io_apic *io_apic = io_apic_base(apic);
+	if (sis_apic_bug)
+		writel(reg, &io_apic->index);
+	writel(value, &io_apic->data);
+}
 #else
-#define vector_to_irq(vector)	(vector)
+#define io_apic_modify io_apic_write
+#endif
+
+union entry_union {
+	struct { u32 w1, w2; };
+	struct IO_APIC_route_entry entry;
+};
+
+static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
+{
+	union entry_union eu;
+	unsigned long flags;
+	spin_lock_irqsave(&ioapic_lock, flags);
+	eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
+	eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+	return eu.entry;
+}
+
+/*
+ * When we write a new IO APIC routing entry, we need to write the high
+ * word first! If the mask bit in the low word is clear, we will enable
+ * the interrupt, and we need to make sure the entry is fully populated
+ * before that happens.
+ */
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+	unsigned long flags;
+	union entry_union eu;
+	eu.entry = e;
+	spin_lock_irqsave(&ioapic_lock, flags);
+	io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+	io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+#ifndef CONFIG_XEN
+/*
+ * When we mask an IO APIC routing entry, we need to write the low
+ * word first, in order to set the mask bit before we change the
+ * high bits!
+ */
+static void ioapic_mask_entry(int apic, int pin)
+{
+	unsigned long flags;
+	union entry_union eu = { .entry.mask = 1 };
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+	io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
 #endif
 
 /*
@@ -156,9 +246,7 @@ static void add_pin_to_irq(unsigned int 
 	entry->pin = pin;
 }
 
-#ifdef CONFIG_XEN
-#define clear_IO_APIC() ((void)0)
-#else
+#ifndef CONFIG_XEN
 /*
  * Reroute an IRQ to a different pin.
  */
@@ -243,25 +331,16 @@ static void unmask_IO_APIC_irq (unsigned
 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
 {
 	struct IO_APIC_route_entry entry;
-	unsigned long flags;
 	
 	/* Check delivery_mode to be sure we're not clearing an SMI pin */
-	spin_lock_irqsave(&ioapic_lock, flags);
-	*(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
-	*(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
-	spin_unlock_irqrestore(&ioapic_lock, flags);
+	entry = ioapic_read_entry(apic, pin);
 	if (entry.delivery_mode == dest_SMI)
 		return;
 
 	/*
 	 * Disable it in the IO-APIC irq-routing table:
 	 */
-	memset(&entry, 0, sizeof(entry));
-	entry.mask = 1;
-	spin_lock_irqsave(&ioapic_lock, flags);
-	io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
-	io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
-	spin_unlock_irqrestore(&ioapic_lock, flags);
+	ioapic_mask_entry(apic, pin);
 }
 
 static void clear_IO_APIC (void)
@@ -301,7 +380,7 @@ static void set_ioapic_affinity_irq(unsi
 			break;
 		entry = irq_2_pin + entry->next;
 	}
-	set_irq_info(irq, cpumask);
+	set_native_irq_info(irq, cpumask);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
@@ -1207,40 +1286,40 @@ static inline int IO_APIC_irq_trigger(in
 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
 u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */
 
-int assign_irq_vector(int irq)
+static int __assign_irq_vector(int irq)
 {
-	unsigned long flags;
 	int vector;
 	struct physdev_irq irq_op;
 
-	BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
-
-	spin_lock_irqsave(&vector_lock, flags);
+	BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
 
-	if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) {
-		spin_unlock_irqrestore(&vector_lock, flags);
-		return IO_APIC_VECTOR(irq);
-	}
+	if (irq_vector[irq] > 0)
+		return irq_vector[irq];
 
 	irq_op.irq = irq;
-	if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
-		spin_unlock_irqrestore(&vector_lock, flags);
+	if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
 		return -ENOSPC;
-	}
 
 	vector = irq_op.vector;
-	vector_irq[vector] = irq;
-	if (irq != AUTO_ASSIGN)
-		IO_APIC_VECTOR(irq) = vector;
+	irq_vector[irq] = vector;
+
+	return vector;
+}
 
+static int assign_irq_vector(int irq)
+{
+	unsigned long flags;
+	int vector;
+
+	spin_lock_irqsave(&vector_lock, flags);
+	vector = __assign_irq_vector(irq);
 	spin_unlock_irqrestore(&vector_lock, flags);
 
 	return vector;
 }
 
 #ifndef CONFIG_XEN
-static struct hw_interrupt_type ioapic_level_type;
-static struct hw_interrupt_type ioapic_edge_type;
+static struct irq_chip ioapic_chip;
 
 #define IOAPIC_AUTO	-1
 #define IOAPIC_EDGE	0
@@ -1248,16 +1327,16 @@ static struct hw_interrupt_type ioapic_e
 
 static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
 {
-	unsigned idx;
-
-	idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq;
-
 	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
 			trigger == IOAPIC_LEVEL)
-		irq_desc[idx].chip = &ioapic_level_type;
-	else
-		irq_desc[idx].chip = &ioapic_edge_type;
-	set_intr_gate(vector, interrupt[idx]);
+		set_irq_chip_and_handler_name(irq, &ioapic_chip,
+					 handle_fasteoi_irq, "fasteoi");
+	else {
+		irq_desc[irq].status |= IRQ_DELAYED_DISABLE;
+		set_irq_chip_and_handler_name(irq, &ioapic_chip,
+					 handle_edge_irq, "edge");
+	}
+	set_intr_gate(vector, interrupt[irq]);
 }
 #else
 #define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
@@ -1328,9 +1407,8 @@ static void __init setup_IO_APIC_irqs(vo
 			if (!apic && (irq < 16))
 				disable_8259A_irq(irq);
 		}
+		ioapic_write_entry(apic, pin, entry);
 		spin_lock_irqsave(&ioapic_lock, flags);
-		io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
-		io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
 		set_native_irq_info(irq, TARGET_CPUS);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
@@ -1347,7 +1425,6 @@ static void __init setup_IO_APIC_irqs(vo
 static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
 {
 	struct IO_APIC_route_entry entry;
-	unsigned long flags;
 
 	memset(&entry,0,sizeof(entry));
 
@@ -1372,15 +1449,13 @@ static void __init setup_ExtINT_IRQ0_pin
 	 * The timer IRQ doesn't have to know that behind the
 	 * scene we have a 8259A-master in AEOI mode ...
 	 */
-	irq_desc[0].chip = &ioapic_edge_type;
+	irq_desc[0].chip = &ioapic_chip;
+	set_irq_handler(0, handle_edge_irq);
 
 	/*
 	 * Add it to the IO-APIC irq-routing table:
 	 */
-	spin_lock_irqsave(&ioapic_lock, flags);
-	io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
-	io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
-	spin_unlock_irqrestore(&ioapic_lock, flags);
+	ioapic_write_entry(apic, pin, entry);
 
 	enable_8259A_irq(0);
 }
@@ -1490,10 +1565,7 @@ void __init print_IO_APIC(void)
 	for (i = 0; i <= reg_01.bits.entries; i++) {
 		struct IO_APIC_route_entry entry;
 
-		spin_lock_irqsave(&ioapic_lock, flags);
-		*(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
-		*(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
-		spin_unlock_irqrestore(&ioapic_lock, flags);
+		entry = ioapic_read_entry(apic, i);
 
 		printk(KERN_DEBUG " %02x %03X %02X  ",
 			i,
@@ -1513,17 +1585,12 @@ void __init print_IO_APIC(void)
 		);
 	}
 	}
-	if (use_pci_vector())
-		printk(KERN_INFO "Using vector-based indexing\n");
 	printk(KERN_DEBUG "IRQ to pin mappings:\n");
 	for (i = 0; i < NR_IRQS; i++) {
 		struct irq_pin_list *entry = irq_2_pin + i;
 		if (entry->pin < 0)
 			continue;
- 		if (use_pci_vector() && !platform_legacy_irq(i))
-			printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
-		else
-			printk(KERN_DEBUG "IRQ%d ", i);
+		printk(KERN_DEBUG "IRQ%d ", i);
 		for (;;) {
 			printk("-> %d:%d", entry->apic, entry->pin);
 			if (!entry->next)
@@ -1709,10 +1776,7 @@ static void __init enable_IO_APIC(void)
 		/* See if any of the pins is in ExtINT mode */
 		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
 			struct IO_APIC_route_entry entry;
-			spin_lock_irqsave(&ioapic_lock, flags);
-			*(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
-			*(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
-			spin_unlock_irqrestore(&ioapic_lock, flags);
+			entry = ioapic_read_entry(apic, pin);
 
 
 			/* If the interrupt line is enabled and in ExtInt mode
@@ -1770,7 +1834,6 @@ void disable_IO_APIC(void)
 	 */
 	if (ioapic_i8259.pin != -1) {
 		struct IO_APIC_route_entry entry;
-		unsigned long flags;
 
 		memset(&entry, 0, sizeof(entry));
 		entry.mask            = 0; /* Enabled */
@@ -1787,12 +1850,7 @@ void disable_IO_APIC(void)
 		/*
 		 * Add it to the IO-APIC irq-routing table:
 		 */
-		spin_lock_irqsave(&ioapic_lock, flags);
-		io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
-			*(((int *)&entry)+1));
-		io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
-			*(((int *)&entry)+0));
-		spin_unlock_irqrestore(&ioapic_lock, flags);
+		ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
 	}
 	disconnect_bsp_APIC(ioapic_i8259.pin != -1);
 #endif
@@ -1959,6 +2017,8 @@ static int __init timer_irq_works(void)
  */
 
 /*
+ * Startup quirk:
+ *
  * Starting up a edge-triggered IO-APIC interrupt is
  * nasty - we need to make sure that we get the edge.
  * If it is already asserted for some reason, we need
@@ -1966,8 +2026,10 @@ static int __init timer_irq_works(void)
  *
  * This is not complete - we should be able to fake
  * an edge even if it isn't on the 8259A...
+ *
+ * (We do this for level-triggered IRQs too - it cannot hurt.)
  */
-static unsigned int startup_edge_ioapic_irq(unsigned int irq)
+static unsigned int startup_ioapic_irq(unsigned int irq)
 {
 	int was_pending = 0;
 	unsigned long flags;
@@ -1984,47 +2046,18 @@ static unsigned int startup_edge_ioapic_
 	return was_pending;
 }
 
-/*
- * Once we have recorded IRQ_PENDING already, we can mask the
- * interrupt for real. This prevents IRQ storms from unhandled
- * devices.
- */
-static void ack_edge_ioapic_irq(unsigned int irq)
-{
-	move_irq(irq);
-	if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
-					== (IRQ_PENDING | IRQ_DISABLED))
-		mask_IO_APIC_irq(irq);
-	ack_APIC_irq();
-}
-
-/*
- * Level triggered interrupts can just be masked,
- * and shutting down and starting up the interrupt
- * is the same as enabling and disabling them -- except
- * with a startup need to return a "was pending" value.
- *
- * Level triggered interrupts are special because we
- * do not touch any IO-APIC register while handling
- * them. We ack the APIC in the end-IRQ handler, not
- * in the start-IRQ-handler. Protection against reentrance
- * from the same interrupt is still provided, both by the
- * generic IRQ layer and by the fact that an unacked local
- * APIC does not accept IRQs.
- */
-static unsigned int startup_level_ioapic_irq (unsigned int irq)
+static void ack_ioapic_irq(unsigned int irq)
 {
-	unmask_IO_APIC_irq(irq);
-
-	return 0; /* don't check for pending */
+	move_native_irq(irq);
+	ack_APIC_irq();
 }
 
-static void end_level_ioapic_irq (unsigned int irq)
+static void ack_ioapic_quirk_irq(unsigned int irq)
 {
 	unsigned long v;
 	int i;
 
-	move_irq(irq);
+	move_native_irq(irq);
 /*
  * It appears there is an erratum which affects at least version 0x11
  * of I/O APIC (that's the 82093AA and cores integrated into various
@@ -2044,7 +2077,7 @@ static void end_level_ioapic_irq (unsign
  * operation to prevent an edge-triggered interrupt escaping meanwhile.
  * The idea is from Manfred Spraul.  --macro
  */
-	i = IO_APIC_VECTOR(irq);
+	i = irq_vector[irq];
 
 	v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
 
@@ -2059,104 +2092,24 @@ static void end_level_ioapic_irq (unsign
 	}
 }
 
-#ifdef CONFIG_PCI_MSI
-static unsigned int startup_edge_ioapic_vector(unsigned int vector)
-{
-	int irq = vector_to_irq(vector);
-
-	return startup_edge_ioapic_irq(irq);
-}
-
-static void ack_edge_ioapic_vector(unsigned int vector)
-{
-	int irq = vector_to_irq(vector);
-
-	move_native_irq(vector);
-	ack_edge_ioapic_irq(irq);
-}
-
-static unsigned int startup_level_ioapic_vector (unsigned int vector)
-{
-	int irq = vector_to_irq(vector);
-
-	return startup_level_ioapic_irq (irq);
-}
-
-static void end_level_ioapic_vector (unsigned int vector)
-{
-	int irq = vector_to_irq(vector);
-
-	move_native_irq(vector);
-	end_level_ioapic_irq(irq);
-}
-
-static void mask_IO_APIC_vector (unsigned int vector)
-{
-	int irq = vector_to_irq(vector);
-
-	mask_IO_APIC_irq(irq);
-}
-
-static void unmask_IO_APIC_vector (unsigned int vector)
-{
-	int irq = vector_to_irq(vector);
-
-	unmask_IO_APIC_irq(irq);
-}
-
-#ifdef CONFIG_SMP
-static void set_ioapic_affinity_vector (unsigned int vector,
-					cpumask_t cpu_mask)
-{
-	int irq = vector_to_irq(vector);
-
-	set_native_irq_info(vector, cpu_mask);
-	set_ioapic_affinity_irq(irq, cpu_mask);
-}
-#endif
-#endif
-
-static int ioapic_retrigger(unsigned int irq)
+static int ioapic_retrigger_irq(unsigned int irq)
 {
-	send_IPI_self(IO_APIC_VECTOR(irq));
+	send_IPI_self(irq_vector[irq]);
 
 	return 1;
 }
 
-/*
- * Level and edge triggered IO-APIC interrupts need different handling,
- * so we use two separate IRQ descriptors. Edge triggered IRQs can be
- * handled with the level-triggered descriptor, but that one has slightly
- * more overhead. Level-triggered interrupts cannot be handled with the
- * edge-triggered handler, without risking IRQ storms and other ugly
- * races.
- */
-static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
-	.typename 	= "IO-APIC-edge",
-	.startup 	= startup_edge_ioapic,
-	.shutdown 	= shutdown_edge_ioapic,
-	.enable 	= enable_edge_ioapic,
-	.disable 	= disable_edge_ioapic,
-	.ack 		= ack_edge_ioapic,
-	.end 		= end_edge_ioapic,
-#ifdef CONFIG_SMP
-	.set_affinity 	= set_ioapic_affinity,
-#endif
-	.retrigger	= ioapic_retrigger,
-};
-
-static struct hw_interrupt_type ioapic_level_type __read_mostly = {
-	.typename 	= "IO-APIC-level",
-	.startup 	= startup_level_ioapic,
-	.shutdown 	= shutdown_level_ioapic,
-	.enable 	= enable_level_ioapic,
-	.disable 	= disable_level_ioapic,
-	.ack 		= mask_and_ack_level_ioapic,
-	.end 		= end_level_ioapic,
+static struct irq_chip ioapic_chip __read_mostly = {
+	.name 		= "IO-APIC",
+	.startup 	= startup_ioapic_irq,
+	.mask	 	= mask_IO_APIC_irq,
+	.unmask	 	= unmask_IO_APIC_irq,
+	.ack 		= ack_ioapic_irq,
+	.eoi 		= ack_ioapic_quirk_irq,
 #ifdef CONFIG_SMP
-	.set_affinity 	= set_ioapic_affinity,
+	.set_affinity 	= set_ioapic_affinity_irq,
 #endif
-	.retrigger	= ioapic_retrigger,
+	.retrigger	= ioapic_retrigger_irq,
 };
 #endif /* !CONFIG_XEN */
 
@@ -2177,12 +2130,7 @@ static inline void init_IO_APIC_traps(vo
 	 */
 	for (irq = 0; irq < NR_IRQS ; irq++) {
 		int tmp = irq;
-		if (use_pci_vector()) {
-			if (!platform_legacy_irq(tmp))
-				if ((tmp = vector_to_irq(tmp)) == -1)
-					continue;
-		}
-		if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
+		if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) {
 			/*
 			 * Hmm.. We don't have an entry for this,
 			 * so default to an old-fashioned 8259
@@ -2193,22 +2141,23 @@ static inline void init_IO_APIC_traps(vo
 #ifndef CONFIG_XEN
 			else
 				/* Strange. Oh, well.. */
-				irq_desc[irq].chip = &no_irq_type;
+				irq_desc[irq].chip = &no_irq_chip;
 #endif
 		}
 	}
 }
 
 #ifndef CONFIG_XEN
-static void enable_lapic_irq (unsigned int irq)
-{
-	unsigned long v;
+/*
+ * The local APIC irq-chip implementation:
+ */
 
-	v = apic_read(APIC_LVT0);
-	apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
+static void ack_apic(unsigned int irq)
+{
+	ack_APIC_irq();
 }
 
-static void disable_lapic_irq (unsigned int irq)
+static void mask_lapic_irq (unsigned int irq)
 {
 	unsigned long v;
 
@@ -2216,21 +2165,19 @@ static void disable_lapic_irq (unsigned 
 	apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
 }
 
-static void ack_lapic_irq (unsigned int irq)
+static void unmask_lapic_irq (unsigned int irq)
 {
-	ack_APIC_irq();
-}
+	unsigned long v;
 
-static void end_lapic_irq (unsigned int i) { /* nothing */ }
+	v = apic_read(APIC_LVT0);
+	apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
+}
 
-static struct hw_interrupt_type lapic_irq_type __read_mostly = {
-	.typename 	= "local-APIC-edge",
-	.startup 	= NULL, /* startup_irq() not used for IRQ0 */
-	.shutdown 	= NULL, /* shutdown_irq() not used for IRQ0 */
-	.enable 	= enable_lapic_irq,
-	.disable 	= disable_lapic_irq,
-	.ack 		= ack_lapic_irq,
-	.end 		= end_lapic_irq
+static struct irq_chip lapic_chip __read_mostly = {
+	.name		= "local-APIC-edge",
+	.mask		= mask_lapic_irq,
+	.unmask		= unmask_lapic_irq,
+	.eoi		= ack_apic,
 };
 
 static void setup_nmi (void)
@@ -2263,17 +2210,13 @@ static inline void unlock_ExtINT_logic(v
 	int apic, pin, i;
 	struct IO_APIC_route_entry entry0, entry1;
 	unsigned char save_control, save_freq_select;
-	unsigned long flags;
 
 	pin  = find_isa_irq_pin(8, mp_INT);
 	apic = find_isa_irq_apic(8, mp_INT);
 	if (pin == -1)
 		return;
 
-	spin_lock_irqsave(&ioapic_lock, flags);
-	*(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
-	*(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
-	spin_unlock_irqrestore(&ioapic_lock, flags);
+	entry0 = ioapic_read_entry(apic, pin);
 	clear_IO_APIC_pin(apic, pin);
 
 	memset(&entry1, 0, sizeof(entry1));
@@ -2286,10 +2229,7 @@ static inline void unlock_ExtINT_logic(v
 	entry1.trigger = 0;
 	entry1.vector = 0;
 
-	spin_lock_irqsave(&ioapic_lock, flags);
-	io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
-	io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
-	spin_unlock_irqrestore(&ioapic_lock, flags);
+	ioapic_write_entry(apic, pin, entry1);
 
 	save_control = CMOS_READ(RTC_CONTROL);
 	save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
@@ -2308,10 +2248,7 @@ static inline void unlock_ExtINT_logic(v
 	CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
 	clear_IO_APIC_pin(apic, pin);
 
-	spin_lock_irqsave(&ioapic_lock, flags);
-	io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
-	io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
-	spin_unlock_irqrestore(&ioapic_lock, flags);
+	ioapic_write_entry(apic, pin, entry0);
 }
 
 int timer_uses_ioapic_pin_0;
@@ -2411,7 +2348,8 @@ static inline void check_timer(void)
 	printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
 
 	disable_8259A_irq(0);
-	irq_desc[0].chip = &lapic_irq_type;
+	set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
+				      "fasteoi");
 	apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector);	/* Fixed mode */
 	enable_8259A_irq(0);
 
@@ -2523,17 +2461,12 @@ static int ioapic_suspend(struct sys_dev
 {
 	struct IO_APIC_route_entry *entry;
 	struct sysfs_ioapic_data *data;
-	unsigned long flags;
 	int i;
 	
 	data = container_of(dev, struct sysfs_ioapic_data, dev);
 	entry = data->entry;
-	spin_lock_irqsave(&ioapic_lock, flags);
-	for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
-		*(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
-		*(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
-	}
-	spin_unlock_irqrestore(&ioapic_lock, flags);
+	for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+		entry[i] = ioapic_read_entry(dev->id, i);
 
 	return 0;
 }
@@ -2555,11 +2488,9 @@ static int ioapic_resume(struct sys_devi
 		reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
 		io_apic_write(dev->id, 0, reg_00.raw);
 	}
-	for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
-		io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
-		io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
-	}
 	spin_unlock_irqrestore(&ioapic_lock, flags);
+	for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+		ioapic_write_entry(dev->id, i, entry[i]);
 
 	return 0;
 }
@@ -2605,6 +2536,240 @@ static int __init ioapic_init_sysfs(void
 
 device_initcall(ioapic_init_sysfs);
 
+#ifndef CONFIG_XEN
+/*
+ * Dynamic irq allocation and deallocation
+ */
+int create_irq(void)
+{
+	/* Allocate an unused irq */
+	int irq, new, vector;
+	unsigned long flags;
+
+	irq = -ENOSPC;
+	spin_lock_irqsave(&vector_lock, flags);
+	for (new = (NR_IRQS - 1); new >= 0; new--) {
+		if (platform_legacy_irq(new))
+			continue;
+		if (irq_vector[new] != 0)
+			continue;
+		vector = __assign_irq_vector(new);
+		if (likely(vector > 0))
+			irq = new;
+		break;
+	}
+	spin_unlock_irqrestore(&vector_lock, flags);
+
+	if (irq >= 0) {
+		set_intr_gate(vector, interrupt[irq]);
+		dynamic_irq_init(irq);
+	}
+	return irq;
+}
+
+void destroy_irq(unsigned int irq)
+{
+	unsigned long flags;
+
+	dynamic_irq_cleanup(irq);
+
+	spin_lock_irqsave(&vector_lock, flags);
+	irq_vector[irq] = 0;
+	spin_unlock_irqrestore(&vector_lock, flags);
+}
+#endif
+
+/*
+ * MSI message composition
+ */
+#ifdef CONFIG_PCI_MSI
+static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
+{
+	int vector;
+	unsigned dest;
+
+	vector = assign_irq_vector(irq);
+	if (vector >= 0) {
+		dest = cpu_mask_to_apicid(TARGET_CPUS);
+
+		msg->address_hi = MSI_ADDR_BASE_HI;
+		msg->address_lo =
+			MSI_ADDR_BASE_LO |
+			((INT_DEST_MODE == 0) ?
+				MSI_ADDR_DEST_MODE_PHYSICAL:
+				MSI_ADDR_DEST_MODE_LOGICAL) |
+			((INT_DELIVERY_MODE != dest_LowestPrio) ?
+				MSI_ADDR_REDIRECTION_CPU:
+				MSI_ADDR_REDIRECTION_LOWPRI) |
+			MSI_ADDR_DEST_ID(dest);
+
+		msg->data =
+			MSI_DATA_TRIGGER_EDGE |
+			MSI_DATA_LEVEL_ASSERT |
+			((INT_DELIVERY_MODE != dest_LowestPrio) ?
+				MSI_DATA_DELIVERY_FIXED:
+				MSI_DATA_DELIVERY_LOWPRI) |
+			MSI_DATA_VECTOR(vector);
+	}
+	return vector;
+}
+
+#ifdef CONFIG_SMP
+static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+	struct msi_msg msg;
+	unsigned int dest;
+	cpumask_t tmp;
+	int vector;
+
+	cpus_and(tmp, mask, cpu_online_map);
+	if (cpus_empty(tmp))
+		tmp = TARGET_CPUS;
+
+	vector = assign_irq_vector(irq);
+	if (vector < 0)
+		return;
+
+	dest = cpu_mask_to_apicid(mask);
+
+	read_msi_msg(irq, &msg);
+
+	msg.data &= ~MSI_DATA_VECTOR_MASK;
+	msg.data |= MSI_DATA_VECTOR(vector);
+	msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+
+	write_msi_msg(irq, &msg);
+	set_native_irq_info(irq, mask);
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
+ * which implement the MSI or MSI-X Capability Structure.
+ */
+static struct irq_chip msi_chip = {
+	.name		= "PCI-MSI",
+	.unmask		= unmask_msi_irq,
+	.mask		= mask_msi_irq,
+	.ack		= ack_ioapic_irq,
+#ifdef CONFIG_SMP
+	.set_affinity	= set_msi_irq_affinity,
+#endif
+	.retrigger	= ioapic_retrigger_irq,
+};
+
+int arch_setup_msi_irq(unsigned int irq, struct pci_dev *dev)
+{
+	struct msi_msg msg;
+	int ret;
+	ret = msi_compose_msg(dev, irq, &msg);
+	if (ret < 0)
+		return ret;
+
+	write_msi_msg(irq, &msg);
+
+	set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
+				      "edge");
+
+	return 0;
+}
+
+void arch_teardown_msi_irq(unsigned int irq)
+{
+	return;
+}
+
+#endif /* CONFIG_PCI_MSI */
+
+/*
+ * Hypertransport interrupt support
+ */
+#ifdef CONFIG_HT_IRQ
+
+#ifdef CONFIG_SMP
+
+static void target_ht_irq(unsigned int irq, unsigned int dest)
+{
+	struct ht_irq_msg msg;
+	fetch_ht_irq_msg(irq, &msg);
+
+	msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK);
+	msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
+
+	msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest);
+	msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
+
+	write_ht_irq_msg(irq, &msg);
+}
+
+static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+	unsigned int dest;
+	cpumask_t tmp;
+
+	cpus_and(tmp, mask, cpu_online_map);
+	if (cpus_empty(tmp))
+		tmp = TARGET_CPUS;
+
+	cpus_and(mask, tmp, CPU_MASK_ALL);
+
+	dest = cpu_mask_to_apicid(mask);
+
+	target_ht_irq(irq, dest);
+	set_native_irq_info(irq, mask);
+}
+#endif
+
+static struct irq_chip ht_irq_chip = {
+	.name		= "PCI-HT",
+	.mask		= mask_ht_irq,
+	.unmask		= unmask_ht_irq,
+	.ack		= ack_ioapic_irq,
+#ifdef CONFIG_SMP
+	.set_affinity	= set_ht_irq_affinity,
+#endif
+	.retrigger	= ioapic_retrigger_irq,
+};
+
+int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
+{
+	int vector;
+
+	vector = assign_irq_vector(irq);
+	if (vector >= 0) {
+		struct ht_irq_msg msg;
+		unsigned dest;
+		cpumask_t tmp;
+
+		cpus_clear(tmp);
+		cpu_set(vector >> 8, tmp);
+		dest = cpu_mask_to_apicid(tmp);
+
+		msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
+
+		msg.address_lo =
+			HT_IRQ_LOW_BASE |
+			HT_IRQ_LOW_DEST_ID(dest) |
+			HT_IRQ_LOW_VECTOR(vector) |
+			((INT_DEST_MODE == 0) ?
+				HT_IRQ_LOW_DM_PHYSICAL :
+				HT_IRQ_LOW_DM_LOGICAL) |
+			HT_IRQ_LOW_RQEOI_EDGE |
+			((INT_DELIVERY_MODE != dest_LowestPrio) ?
+				HT_IRQ_LOW_MT_FIXED :
+				HT_IRQ_LOW_MT_ARBITRATED) |
+			HT_IRQ_LOW_IRQ_MASKED;
+
+		write_ht_irq_msg(irq, &msg);
+
+		set_irq_chip_and_handler_name(irq, &ht_irq_chip,
+					      handle_edge_irq, "edge");
+	}
+	return vector;
+}
+#endif /* CONFIG_HT_IRQ */
+
 /* --------------------------------------------------------------------------
                           ACPI-based IOAPIC Configuration
    -------------------------------------------------------------------------- */
@@ -2758,13 +2923,34 @@ int io_apic_set_pci_routing (int ioapic,
 	if (!ioapic && (irq < 16))
 		disable_8259A_irq(irq);
 
+	ioapic_write_entry(ioapic, pin, entry);
 	spin_lock_irqsave(&ioapic_lock, flags);
-	io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
-	io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
-	set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
+	set_native_irq_info(irq, TARGET_CPUS);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	return 0;
 }
 
 #endif /* CONFIG_ACPI */
+
+static int __init parse_disable_timer_pin_1(char *arg)
+{
+	disable_timer_pin_1 = 1;
+	return 0;
+}
+early_param("disable_timer_pin_1", parse_disable_timer_pin_1);
+
+static int __init parse_enable_timer_pin_1(char *arg)
+{
+	disable_timer_pin_1 = -1;
+	return 0;
+}
+early_param("enable_timer_pin_1", parse_enable_timer_pin_1);
+
+static int __init parse_noapic(char *arg)
+{
+	/* disable IO-APIC */
+	disable_ioapic_setup();
+	return 0;
+}
+early_param("noapic", parse_noapic);
Index: head-2008-07-15/arch/x86/kernel/irq_32-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/irq_32-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/irq_32-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -53,8 +53,10 @@ static union irq_ctx *softirq_ctx[NR_CPU
  */
 fastcall unsigned int do_IRQ(struct pt_regs *regs)
 {	
+	struct pt_regs *old_regs;
 	/* high bit used in ret_from_ code */
 	int irq = ~regs->orig_eax;
+	struct irq_desc *desc = irq_desc + irq;
 #ifdef CONFIG_4KSTACKS
 	union irq_ctx *curctx, *irqctx;
 	u32 *isp;
@@ -66,6 +68,7 @@ fastcall unsigned int do_IRQ(struct pt_r
 		BUG();
 	}
 
+	old_regs = set_irq_regs(regs);
 	irq_enter();
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 	/* Debugging check for stack overflow: is there less than 1KB free? */
@@ -110,19 +113,20 @@ fastcall unsigned int do_IRQ(struct pt_r
 			(curctx->tinfo.preempt_count & SOFTIRQ_MASK);
 
 		asm volatile(
-			"       xchgl   %%ebx,%%esp      \n"
-			"       call    __do_IRQ         \n"
+			"       xchgl  %%ebx,%%esp      \n"
+			"       call   *%%edi           \n"
 			"       movl   %%ebx,%%esp      \n"
 			: "=a" (arg1), "=d" (arg2), "=b" (ebx)
-			:  "0" (irq),   "1" (regs),  "2" (isp)
-			: "memory", "cc", "ecx"
+			:  "0" (irq),   "1" (desc),  "2" (isp),
+			   "D" (desc->handle_irq)
+			: "memory", "cc"
 		);
 	} else
 #endif
-		__do_IRQ(irq, regs);
+		desc->handle_irq(irq, desc);
 
 	irq_exit();
-
+	set_irq_regs(old_regs);
 	return 1;
 }
 
@@ -253,7 +257,8 @@ int show_interrupts(struct seq_file *p, 
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
 #endif
-		seq_printf(p, " %14s", irq_desc[i].chip->typename);
+		seq_printf(p, " %8s", irq_desc[i].chip->name);
+		seq_printf(p, "-%-8s", irq_desc[i].name);
 		seq_printf(p, "  %s", action->name);
 
 		for (action=action->next; action; action = action->next)
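
The do_IRQ() changes above follow the tree-wide 2.6.19 switch from
threading struct pt_regs through every handler to stashing it per-CPU
around the call. A minimal sketch of that mechanism, with _demo suffixes
marking the illustrative names (the real helpers live in asm-generic
code):

    static DEFINE_PER_CPU(struct pt_regs *, irq_regs_demo);

    static inline struct pt_regs *set_irq_regs_demo(struct pt_regs *new_regs)
    {
        struct pt_regs *old = __get_cpu_var(irq_regs_demo);

        __get_cpu_var(irq_regs_demo) = new_regs;
        return old;     /* do_IRQ() restores this on the way out */
    }
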
Index: head-2008-07-15/arch/x86/kernel/ldt_32-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/ldt_32-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/ldt_32-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -1,5 +1,5 @@
 /*
- * linux/kernel/ldt.c
+ * linux/arch/i386/kernel/ldt.c
  *
  * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
  * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
Index: head-2008-07-15/arch/x86/kernel/microcode-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/microcode-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/microcode-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -2,6 +2,7 @@
  *	Intel CPU Microcode Update Driver for Linux
  *
  *	Copyright (C) 2000-2004 Tigran Aivazian
+ *		      2006	Shaohua Li <shaohua.li@intel.com>
  *
  *	This driver allows to upgrade microcode on Intel processors
  *	belonging to IA-32 family - PentiumPro, Pentium II, 
@@ -33,7 +34,9 @@
 #include <linux/spinlock.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
-#include <linux/syscalls.h>
+#include <linux/cpu.h>
+#include <linux/firmware.h>
+#include <linux/platform_device.h>
 
 #include <asm/msr.h>
 #include <asm/uaccess.h>
@@ -55,12 +58,7 @@ module_param(verbose, int, 0644);
 /* no concurrent ->write()s are allowed on /dev/cpu/microcode */
 static DEFINE_MUTEX(microcode_mutex);
 				
-static int microcode_open (struct inode *unused1, struct file *unused2)
-{
-	return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
-}
-
-
+#ifdef CONFIG_MICROCODE_OLD_INTERFACE
 static int do_microcode_update (const void __user *ubuf, size_t len)
 {
 	int err;
@@ -85,6 +83,11 @@ static int do_microcode_update (const vo
 	return err;
 }
 
+static int microcode_open (struct inode *unused1, struct file *unused2)
+{
+	return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
+}
+
 static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
 {
 	ssize_t ret;
@@ -117,7 +120,7 @@ static struct miscdevice microcode_dev =
 	.fops		= &microcode_fops,
 };
 
-static int __init microcode_init (void)
+static int __init microcode_dev_init (void)
 {
 	int error;
 
@@ -129,6 +132,68 @@ static int __init microcode_init (void)
 		return error;
 	}
 
+	return 0;
+}
+
+static void __exit microcode_dev_exit (void)
+{
+	misc_deregister(&microcode_dev);
+}
+
+MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
+#else
+#define microcode_dev_init() 0
+#define microcode_dev_exit() do { } while(0)
+#endif
+
+/* fake device for request_firmware */
+static struct platform_device *microcode_pdev;
+
+static int request_microcode(void)
+{
+	char name[30];
+	const struct cpuinfo_x86 *c = &boot_cpu_data;
+	const struct firmware *firmware;
+	int error;
+	struct xen_platform_op op;
+
+	sprintf(name,"intel-ucode/%02x-%02x-%02x",
+		c->x86, c->x86_model, c->x86_mask);
+	error = request_firmware(&firmware, name, &microcode_pdev->dev);
+	if (error) {
+		pr_debug("ucode data file %s load failed\n", name);
+		return error;
+	}
+
+	op.cmd = XENPF_microcode_update;
+	set_xen_guest_handle(op.u.microcode.data, (void *)firmware->data);
+	op.u.microcode.length = firmware->size;
+	error = HYPERVISOR_platform_op(&op);
+
+	release_firmware(firmware);
+
+	if (error)
+		pr_debug("ucode load failed\n");
+
+	return error;
+}
+
+static int __init microcode_init (void)
+{
+	int error;
+
+	error = microcode_dev_init();
+	if (error)
+		return error;
+	microcode_pdev = platform_device_register_simple("microcode", -1,
+							 NULL, 0);
+	if (IS_ERR(microcode_pdev)) {
+		microcode_dev_exit();
+		return PTR_ERR(microcode_pdev);
+	}
+
+	request_microcode();
+
 	printk(KERN_INFO 
 		"IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n");
 	return 0;
@@ -136,9 +201,9 @@ static int __init microcode_init (void)
 
 static void __exit microcode_exit (void)
 {
-	misc_deregister(&microcode_dev);
+	microcode_dev_exit();
+	platform_device_unregister(microcode_pdev);
 }
 
 module_init(microcode_init)
 module_exit(microcode_exit)
-MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
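
The rewritten microcode init path above pairs a throwaway platform device
with request_firmware(), because the firmware loader needs a struct device
to anchor each request. A stripped-down sketch of that pairing under
assumed "demo" names, with error handling trimmed to the essentials:

    static struct platform_device *demo_pdev;

    static int __init demo_init(void)
    {
        const struct firmware *fw;
        int err;

        demo_pdev = platform_device_register_simple("demo", -1, NULL, 0);
        if (IS_ERR(demo_pdev))
            return PTR_ERR(demo_pdev);

        err = request_firmware(&fw, "demo/blob.bin", &demo_pdev->dev);
        if (!err) {
            /* consume fw->data / fw->size, then drop the reference */
            release_firmware(fw);
        }
        return err;
    }
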
Index: head-2008-07-15/arch/x86/kernel/mpparse_32-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/mpparse_32-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/mpparse_32-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -30,6 +30,7 @@
 #include <asm/io_apic.h>
 
 #include <mach_apic.h>
+#include <mach_apicdef.h>
 #include <mach_mpparse.h>
 #include <bios_ebda.h>
 
@@ -68,7 +69,7 @@ unsigned int def_to_bigsmp = 0;
 /* Processor that is doing the boot up */
 unsigned int boot_cpu_physical_apicid = -1U;
 /* Internal processor count */
-static unsigned int __devinitdata num_processors;
+unsigned int __cpuinitdata num_processors;
 
 /* Bitmask of physically existing CPUs */
 physid_mask_t phys_cpu_present_map;
@@ -235,12 +236,14 @@ static void __init MP_bus_info (struct m
 
 	mpc_oem_bus_info(m, str, translation_table[mpc_record]);
 
+#if MAX_MP_BUSSES < 256
 	if (m->mpc_busid >= MAX_MP_BUSSES) {
 		printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
 			" is too large, max. supported is %d\n",
 			m->mpc_busid, str, MAX_MP_BUSSES - 1);
 		return;
 	}
+#endif
 
 	if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
 		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
@@ -300,19 +303,6 @@ static void __init MP_lintsrc_info (stru
 			m->mpc_irqtype, m->mpc_irqflag & 3,
 			(m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
 			m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
-	/*
-	 * Well it seems all SMP boards in existence
-	 * use ExtINT/LVT1 == LINT0 and
-	 * NMI/LVT2 == LINT1 - the following check
-	 * will show us if this assumptions is false.
-	 * Until then we do not have to add baggage.
-	 */
-	if ((m->mpc_irqtype == mp_ExtINT) &&
-		(m->mpc_destapiclint != 0))
-			BUG();
-	if ((m->mpc_irqtype == mp_NMI) &&
-		(m->mpc_destapiclint != 1))
-			BUG();
 }
 
 #ifdef CONFIG_X86_NUMAQ
@@ -838,8 +828,7 @@ int es7000_plat;
 
 #ifdef CONFIG_ACPI
 
-void __init mp_register_lapic_address (
-	u64			address)
+void __init mp_register_lapic_address(u64 address)
 {
 #ifndef CONFIG_XEN
 	mp_lapic_addr = (unsigned long) address;
@@ -853,13 +842,10 @@ void __init mp_register_lapic_address (
 #endif
 }
 
-
-void __devinit mp_register_lapic (
-	u8			id, 
-	u8			enabled)
+void __devinit mp_register_lapic (u8 id, u8 enabled)
 {
 	struct mpc_config_processor processor;
-	int			boot_cpu = 0;
+	int boot_cpu = 0;
 	
 	if (MAX_APICS - id <= 0) {
 		printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
@@ -898,11 +884,9 @@ static struct mp_ioapic_routing {
 	u32			pin_programmed[4];
 } mp_ioapic_routing[MAX_IO_APICS];
 
-
-static int mp_find_ioapic (
-	int			gsi)
+static int mp_find_ioapic (int gsi)
 {
-	int			i = 0;
+	int i = 0;
 
 	/* Find the IOAPIC that manages this GSI. */
 	for (i = 0; i < nr_ioapics; i++) {
@@ -915,15 +899,11 @@ static int mp_find_ioapic (
 
 	return -1;
 }
-	
 
-void __init mp_register_ioapic (
-	u8			id, 
-	u32			address,
-	u32			gsi_base)
+void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
 {
-	int			idx = 0;
-	int			tmpid;
+	int idx = 0;
+	int tmpid;
 
 	if (nr_ioapics >= MAX_IO_APICS) {
 		printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
@@ -971,16 +951,10 @@ void __init mp_register_ioapic (
 		mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
 		mp_ioapic_routing[idx].gsi_base,
 		mp_ioapic_routing[idx].gsi_end);
-
-	return;
 }
 
-
-void __init mp_override_legacy_irq (
-	u8			bus_irq,
-	u8			polarity, 
-	u8			trigger, 
-	u32			gsi)
+void __init
+mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 {
 	struct mpc_config_intsrc intsrc;
 	int			ioapic = -1;
@@ -1018,15 +992,13 @@ void __init mp_override_legacy_irq (
 	mp_irqs[mp_irq_entries] = intsrc;
 	if (++mp_irq_entries == MAX_IRQ_SOURCES)
 		panic("Max # of irq sources exceeded!\n");
-
-	return;
 }
 
 void __init mp_config_acpi_legacy_irqs (void)
 {
 	struct mpc_config_intsrc intsrc;
-	int			i = 0;
-	int			ioapic = -1;
+	int i = 0;
+	int ioapic = -1;
 
 	/* 
 	 * Fabricate the legacy ISA bus (bus #31).
@@ -1095,12 +1067,12 @@ void __init mp_config_acpi_legacy_irqs (
 
 #define MAX_GSI_NUM	4096
 
-int mp_register_gsi (u32 gsi, int triggering, int polarity)
+int mp_register_gsi(u32 gsi, int triggering, int polarity)
 {
-	int			ioapic = -1;
-	int			ioapic_pin = 0;
-	int			idx, bit = 0;
-	static int		pci_irq = 16;
+	int ioapic = -1;
+	int ioapic_pin = 0;
+	int idx, bit = 0;
+	static int pci_irq = 16;
 	/*
 	 * Mapping between Global System Interrups, which
 	 * represent all possible interrupts, and IRQs
Index: head-2008-07-15/arch/x86/kernel/pci-dma-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/pci-dma-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/pci-dma-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -116,8 +116,7 @@ dma_map_sg(struct device *hwdev, struct 
 {
 	int i, rc;
 
-	if (direction == DMA_NONE)
-		BUG();
+	BUG_ON(!valid_dma_direction(direction));
 	WARN_ON(nents == 0 || sg[0].length == 0);
 
 	if (swiotlb) {
@@ -148,7 +147,7 @@ dma_unmap_sg(struct device *hwdev, struc
 {
 	int i;
 
-	BUG_ON(direction == DMA_NONE);
+	BUG_ON(!valid_dma_direction(direction));
 	if (swiotlb)
 		swiotlb_unmap_sg(hwdev, sg, nents, direction);
 	else {
@@ -165,8 +164,7 @@ dma_map_page(struct device *dev, struct 
 {
 	dma_addr_t dma_addr;
 
-	BUG_ON(direction == DMA_NONE);
-
+	BUG_ON(!valid_dma_direction(direction));
 	if (swiotlb) {
 		dma_addr = swiotlb_map_page(
 			dev, page, offset, size, direction);
@@ -183,7 +181,7 @@ void
 dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
 	       enum dma_data_direction direction)
 {
-	BUG_ON(direction == DMA_NONE);
+	BUG_ON(!valid_dma_direction(direction));
 	if (swiotlb)
 		swiotlb_unmap_page(dev, dma_address, size, direction);
 	else
@@ -365,8 +363,7 @@ dma_map_single(struct device *dev, void 
 {
 	dma_addr_t dma;
 
-	if (direction == DMA_NONE)
-		BUG();
+	BUG_ON(!valid_dma_direction(direction));
 	WARN_ON(size == 0);
 
 	if (swiotlb) {
@@ -387,8 +384,7 @@ void
 dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
 		 enum dma_data_direction direction)
 {
-	if (direction == DMA_NONE)
-		BUG();
+	BUG_ON(!valid_dma_direction(direction));
 	if (swiotlb)
 		swiotlb_unmap_single(dev, dma_addr, size, direction);
 	else
Index: head-2008-07-15/arch/x86/kernel/process_32-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/process_32-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/process_32-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -37,6 +37,7 @@
 #include <linux/kallsyms.h>
 #include <linux/ptrace.h>
 #include <linux/random.h>
+#include <linux/personality.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -186,7 +187,7 @@ void cpu_idle(void)
 void cpu_idle_wait(void)
 {
 	unsigned int cpu, this_cpu = get_cpu();
-	cpumask_t map;
+	cpumask_t map, tmp = current->cpus_allowed;
 
 	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
 	put_cpu();
@@ -208,6 +209,8 @@ void cpu_idle_wait(void)
 		}
 		cpus_and(map, map, cpu_online_map);
 	} while (!cpus_empty(map));
+
+	set_cpus_allowed(current, tmp);
 }
 EXPORT_SYMBOL_GPL(cpu_idle_wait);
 
@@ -240,9 +243,9 @@ void show_regs(struct pt_regs * regs)
 	if (user_mode_vm(regs))
 		printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
 	printk(" EFLAGS: %08lx    %s  (%s %.*s)\n",
-	       regs->eflags, print_tainted(), system_utsname.release,
-	       (int)strcspn(system_utsname.version, " "),
-	       system_utsname.version);
+	       regs->eflags, print_tainted(), init_utsname()->release,
+	       (int)strcspn(init_utsname()->version, " "),
+	       init_utsname()->version);
 	printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
 		regs->eax,regs->ebx,regs->ecx,regs->edx);
 	printk("ESI: %08lx EDI: %08lx EBP: %08lx",
@@ -264,15 +267,6 @@ void show_regs(struct pt_regs * regs)
  * the "args".
  */
 extern void kernel_thread_helper(void);
-__asm__(".section .text\n"
-	".align 4\n"
-	"kernel_thread_helper:\n\t"
-	"movl %edx,%eax\n\t"
-	"pushl %edx\n\t"
-	"call *%ebx\n\t"
-	"pushl %eax\n\t"
-	"call do_exit\n"
-	".previous");
 
 /*
  * Create a kernel thread
@@ -290,7 +284,7 @@ int kernel_thread(int (*fn)(void *), voi
 	regs.xes = __USER_DS;
 	regs.orig_eax = -1;
 	regs.eip = (unsigned long) kernel_thread_helper;
-	regs.xcs = GET_KERNEL_CS();
+	regs.xcs = __KERNEL_CS | get_kernel_rpl();
 	regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
 
 	/* Ok, create the new process.. */
@@ -369,13 +363,12 @@ int copy_thread(int nr, unsigned long cl
 
 	tsk = current;
 	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
-		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+		p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
+						IO_BITMAP_BYTES, GFP_KERNEL);
 		if (!p->thread.io_bitmap_ptr) {
 			p->thread.io_bitmap_max = 0;
 			return -ENOMEM;
 		}
-		memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
-			IO_BITMAP_BYTES);
 		set_tsk_thread_flag(p, TIF_IO_BITMAP);
 	}
 
@@ -850,7 +843,7 @@ asmlinkage int sys_get_thread_area(struc
 
 unsigned long arch_align_stack(unsigned long sp)
 {
-	if (randomize_va_space)
+	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
 		sp -= get_random_int() % 8192;
 	return sp & ~0xf;
 }
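
The arch_align_stack() hunk above makes stack randomization honor the
ADDR_NO_RANDOMIZE personality flag pulled in via <linux/personality.h>.
A hedged userspace model of the arithmetic; rand() stands in for the
kernel's get_random_int(), and the flag value is copied from the
personality header of that era.

#include <stdio.h>
#include <stdlib.h>

#define ADDR_NO_RANDOMIZE 0x0040000UL	/* from <linux/personality.h> */

/* Subtract up to 8 KiB of jitter, then align down to 16 bytes. */
static unsigned long arch_align_stack(unsigned long sp,
				      unsigned long personality,
				      int randomize_va_space)
{
	if (!(personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= rand() % 8192;
	return sp & ~0xfUL;
}

int main(void)
{
	unsigned long sp = 0xbffff000UL;

	printf("randomized:   %#lx\n", arch_align_stack(sp, 0, 1));
	printf("unrandomized: %#lx\n",
	       arch_align_stack(sp, ADDR_NO_RANDOMIZE, 1));
	return 0;
}
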
Index: head-2008-07-15/arch/x86/kernel/setup_32-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/setup_32-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/setup_32-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -56,6 +56,7 @@
 #include <asm/apic.h>
 #include <asm/e820.h>
 #include <asm/mpspec.h>
+#include <asm/mmzone.h>
 #include <asm/setup.h>
 #include <asm/arch_hooks.h>
 #include <asm/sections.h>
@@ -105,18 +106,6 @@ EXPORT_SYMBOL(boot_cpu_data);
 
 unsigned long mmu_cr4_features;
 
-#ifdef	CONFIG_ACPI
-	int acpi_disabled = 0;
-#else
-	int acpi_disabled = 1;
-#endif
-EXPORT_SYMBOL(acpi_disabled);
-
-#ifdef	CONFIG_ACPI
-int __initdata acpi_force = 0;
-extern acpi_interrupt_flags	acpi_sci_flags;
-#endif
-
 /* for MCA, but anyone else can use it if they want */
 unsigned int machine_id;
 #ifdef CONFIG_MCA
@@ -170,7 +159,6 @@ struct e820map machine_e820;
 #endif
 
 extern void early_cpu_init(void);
-extern void generic_apic_probe(char *);
 extern int root_mountflags;
 
 unsigned long saved_videomode;
@@ -243,9 +231,6 @@ static struct resource adapter_rom_resou
 	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
 } };
 
-#define ADAPTER_ROM_RESOURCES \
-	(sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
-
 static struct resource video_rom_resource = {
 	.name 	= "Video ROM",
 	.start	= 0xc0000,
@@ -307,9 +292,6 @@ static struct resource standard_io_resou
 	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
 } };
 
-#define STANDARD_IO_RESOURCES \
-	(sizeof standard_io_resources / sizeof standard_io_resources[0])
-
 #define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
 
 static int __init romchecksum(unsigned char *rom, unsigned long length)
@@ -372,7 +354,7 @@ static void __init probe_roms(void)
 	}
 
 	/* check for adapter roms on 2k boundaries */
-	for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
+	for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
 		rom = isa_bus_to_virt(start);
 		if (!romsignature(rom))
 			continue;
@@ -779,246 +761,152 @@ static inline void copy_edd(void)
 }
 #endif
 
-static void __init parse_cmdline_early (char ** cmdline_p)
+static int __initdata user_defined_memmap = 0;
+
+/*
+ * "mem=nopentium" disables the 4MB page tables.
+ * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
+ * to <mem>, overriding the bios size.
+ * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
+ * <start> to <start>+<mem>, overriding the bios size.
+ *
+ * HPA tells me bootloaders need to parse mem=, so no new
+ * option should be mem=  [also see Documentation/i386/boot.txt]
+ */
+static int __init parse_mem(char *arg)
 {
-	char c = ' ', *to = command_line, *from = saved_command_line;
-	int len = 0, max_cmdline;
-	int userdef = 0;
-
-	if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
-		max_cmdline = COMMAND_LINE_SIZE;
-	memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
-	/* Save unparsed command line copy for /proc/cmdline */
-	saved_command_line[max_cmdline-1] = '\0';
-
-	for (;;) {
-		if (c != ' ')
-			goto next_char;
-		/*
-		 * "mem=nopentium" disables the 4MB page tables.
-		 * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
-		 * to <mem>, overriding the bios size.
-		 * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
-		 * <start> to <start>+<mem>, overriding the bios size.
-		 *
-		 * HPA tells me bootloaders need to parse mem=, so no new
-		 * option should be mem=  [also see Documentation/i386/boot.txt]
-		 */
-		if (!memcmp(from, "mem=", 4)) {
-			if (to != command_line)
-				to--;
-			if (!memcmp(from+4, "nopentium", 9)) {
-				from += 9+4;
-				clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
-				disable_pse = 1;
-			} else {
-				/* If the user specifies memory size, we
-				 * limit the BIOS-provided memory map to
-				 * that size. exactmap can be used to specify
-				 * the exact map. mem=number can be used to
-				 * trim the existing memory map.
-				 */
-				unsigned long long mem_size;
- 
-				mem_size = memparse(from+4, &from);
-				limit_regions(mem_size);
-				userdef=1;
-			}
-		}
+	if (!arg)
+		return -EINVAL;
 
-		else if (!memcmp(from, "memmap=", 7)) {
-			if (to != command_line)
-				to--;
-			if (!memcmp(from+7, "exactmap", 8)) {
-#ifdef CONFIG_CRASH_DUMP
-				/* If we are doing a crash dump, we
-				 * still need to know the real mem
-				 * size before original memory map is
-				 * reset.
-				 */
-				find_max_pfn();
-				saved_max_pfn = max_pfn;
-#endif
-				from += 8+7;
-				e820.nr_map = 0;
-				userdef = 1;
-			} else {
-				/* If the user specifies memory size, we
-				 * limit the BIOS-provided memory map to
-				 * that size. exactmap can be used to specify
-				 * the exact map. mem=number can be used to
-				 * trim the existing memory map.
-				 */
-				unsigned long long start_at, mem_size;
+	if (strcmp(arg, "nopentium") == 0) {
+		clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
+		disable_pse = 1;
+	} else {
+		/* If the user specifies memory size, we
+		 * limit the BIOS-provided memory map to
+		 * that size. exactmap can be used to specify
+		 * the exact map. mem=number can be used to
+		 * trim the existing memory map.
+		 */
+		unsigned long long mem_size;
  
-				mem_size = memparse(from+7, &from);
-				if (*from == '@') {
-					start_at = memparse(from+1, &from);
-					add_memory_region(start_at, mem_size, E820_RAM);
-				} else if (*from == '#') {
-					start_at = memparse(from+1, &from);
-					add_memory_region(start_at, mem_size, E820_ACPI);
-				} else if (*from == '$') {
-					start_at = memparse(from+1, &from);
-					add_memory_region(start_at, mem_size, E820_RESERVED);
-				} else {
-					limit_regions(mem_size);
-					userdef=1;
-				}
-			}
-		}
-
-		else if (!memcmp(from, "noexec=", 7))
-			noexec_setup(from + 7);
+		mem_size = memparse(arg, &arg);
+		limit_regions(mem_size);
+		user_defined_memmap = 1;
+	}
+	return 0;
+}
+early_param("mem", parse_mem);
 
+static int __init parse_memmap(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
 
-#ifdef  CONFIG_X86_MPPARSE
-		/*
-		 * If the BIOS enumerates physical processors before logical,
-		 * maxcpus=N at enumeration-time can be used to disable HT.
+	if (strcmp(arg, "exactmap") == 0) {
+#ifdef CONFIG_CRASH_DUMP
+		/* If we are doing a crash dump, we
+		 * still need to know the real mem
+		 * size before original memory map is
+		 * reset.
 		 */
-		else if (!memcmp(from, "maxcpus=", 8)) {
-			extern unsigned int maxcpus;
-
-			maxcpus = simple_strtoul(from + 8, NULL, 0);
-		}
+		find_max_pfn();
+		saved_max_pfn = max_pfn;
 #endif
+		e820.nr_map = 0;
+		user_defined_memmap = 1;
+	} else {
+		/* If the user specifies memory size, we
+		 * limit the BIOS-provided memory map to
+		 * that size. exactmap can be used to specify
+		 * the exact map. mem=number can be used to
+		 * trim the existing memory map.
+		 */
+		unsigned long long start_at, mem_size;
 
-#ifdef CONFIG_ACPI
-		/* "acpi=off" disables both ACPI table parsing and interpreter */
-		else if (!memcmp(from, "acpi=off", 8)) {
-			disable_acpi();
-		}
-
-		/* acpi=force to over-ride black-list */
-		else if (!memcmp(from, "acpi=force", 10)) {
-			acpi_force = 1;
-			acpi_ht = 1;
-			acpi_disabled = 0;
-		}
-
-		/* acpi=strict disables out-of-spec workarounds */
-		else if (!memcmp(from, "acpi=strict", 11)) {
-			acpi_strict = 1;
-		}
-
-		/* Limit ACPI just to boot-time to enable HT */
-		else if (!memcmp(from, "acpi=ht", 7)) {
-			if (!acpi_force)
-				disable_acpi();
-			acpi_ht = 1;
-		}
-		
-		/* "pci=noacpi" disable ACPI IRQ routing and PCI scan */
-		else if (!memcmp(from, "pci=noacpi", 10)) {
-			acpi_disable_pci();
-		}
-		/* "acpi=noirq" disables ACPI interrupt routing */
-		else if (!memcmp(from, "acpi=noirq", 10)) {
-			acpi_noirq_set();
+		mem_size = memparse(arg, &arg);
+		if (*arg == '@') {
+			start_at = memparse(arg+1, &arg);
+			add_memory_region(start_at, mem_size, E820_RAM);
+		} else if (*arg == '#') {
+			start_at = memparse(arg+1, &arg);
+			add_memory_region(start_at, mem_size, E820_ACPI);
+		} else if (*arg == '$') {
+			start_at = memparse(arg+1, &arg);
+			add_memory_region(start_at, mem_size, E820_RESERVED);
+		} else {
+			limit_regions(mem_size);
+			user_defined_memmap = 1;
 		}
+	}
+	return 0;
+}
+early_param("memmap", parse_memmap);
 
-		else if (!memcmp(from, "acpi_sci=edge", 13))
-			acpi_sci_flags.trigger =  1;
-
-		else if (!memcmp(from, "acpi_sci=level", 14))
-			acpi_sci_flags.trigger = 3;
+#ifdef CONFIG_PROC_VMCORE
+/* elfcorehdr= specifies the location of the ELF core header
+ * stored by the crashed kernel.
+ */
+static int __init parse_elfcorehdr(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
 
-		else if (!memcmp(from, "acpi_sci=high", 13))
-			acpi_sci_flags.polarity = 1;
+	elfcorehdr_addr = memparse(arg, &arg);
+	return 0;
+}
+early_param("elfcorehdr", parse_elfcorehdr);
+#endif /* CONFIG_PROC_VMCORE */
 
-		else if (!memcmp(from, "acpi_sci=low", 12))
-			acpi_sci_flags.polarity = 3;
+/*
+ * highmem=size forces highmem to be exactly 'size' bytes.
+ * This works even on boxes that have no highmem otherwise.
+ * This also works to reduce highmem size on bigger boxes.
+ */
+static int __init parse_highmem(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
 
-#ifdef CONFIG_X86_IO_APIC
-		else if (!memcmp(from, "acpi_skip_timer_override", 24))
-			acpi_skip_timer_override = 1;
+	highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
+	return 0;
+}
+early_param("highmem", parse_highmem);
 
-		if (!memcmp(from, "disable_timer_pin_1", 19))
-			disable_timer_pin_1 = 1;
-		if (!memcmp(from, "enable_timer_pin_1", 18))
-			disable_timer_pin_1 = -1;
-
-		/* disable IO-APIC */
-		else if (!memcmp(from, "noapic", 6))
-			disable_ioapic_setup();
-#endif /* CONFIG_X86_IO_APIC */
-#endif /* CONFIG_ACPI */
+/*
+ * vmalloc=size forces the vmalloc area to be exactly 'size'
+ * bytes. This can be used to increase (or decrease) the
+ * vmalloc area - the default is 128m.
+ */
+static int __init parse_vmalloc(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
 
-#ifdef CONFIG_X86_LOCAL_APIC
-		/* enable local APIC */
-		else if (!memcmp(from, "lapic", 5))
-			lapic_enable();
-
-		/* disable local APIC */
-		else if (!memcmp(from, "nolapic", 6))
-			lapic_disable();
-#endif /* CONFIG_X86_LOCAL_APIC */
+	__VMALLOC_RESERVE = memparse(arg, &arg);
+	return 0;
+}
+early_param("vmalloc", parse_vmalloc);
 
-#ifdef CONFIG_KEXEC
-		/* crashkernel=size@addr specifies the location to reserve for
-		 * a crash kernel.  By reserving this memory we guarantee
-		 * that linux never set's it up as a DMA target.
-		 * Useful for holding code to do something appropriate
-		 * after a kernel panic.
-		 */
-		else if (!memcmp(from, "crashkernel=", 12)) {
 #ifndef CONFIG_XEN
-			unsigned long size, base;
-			size = memparse(from+12, &from);
-			if (*from == '@') {
-				base = memparse(from+1, &from);
-				/* FIXME: Do I want a sanity check
-				 * to validate the memory range?
-				 */
-				crashk_res.start = base;
-				crashk_res.end   = base + size - 1;
-			}
-#else
-			printk("Ignoring crashkernel command line, "
-			       "parameter will be supplied by xen\n");
-#endif
-		}
-#endif
-#ifdef CONFIG_PROC_VMCORE
-		/* elfcorehdr= specifies the location of elf core header
-		 * stored by the crashed kernel.
-		 */
-		else if (!memcmp(from, "elfcorehdr=", 11))
-			elfcorehdr_addr = memparse(from+11, &from);
-#endif
+/*
+ * reservetop=size reserves a hole at the top of the kernel address space which
+ * a hypervisor can load into later.  Needed for dynamically loaded hypervisors,
+ * so relocating the fixmap can be done before paging initialization.
+ */
+static int __init parse_reservetop(char *arg)
+{
+	unsigned long address;
 
-		/*
-		 * highmem=size forces highmem to be exactly 'size' bytes.
-		 * This works even on boxes that have no highmem otherwise.
-		 * This also works to reduce highmem size on bigger boxes.
-		 */
-		else if (!memcmp(from, "highmem=", 8))
-			highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;
-	
-		/*
-		 * vmalloc=size forces the vmalloc area to be exactly 'size'
-		 * bytes. This can be used to increase (or decrease) the
-		 * vmalloc area - the default is 128m.
-		 */
-		else if (!memcmp(from, "vmalloc=", 8))
-			__VMALLOC_RESERVE = memparse(from+8, &from);
+	if (!arg)
+		return -EINVAL;
 
-	next_char:
-		c = *(from++);
-		if (!c)
-			break;
-		if (COMMAND_LINE_SIZE <= ++len)
-			break;
-		*(to++) = c;
-	}
-	*to = '\0';
-	*cmdline_p = command_line;
-	if (userdef) {
-		printk(KERN_INFO "user-defined physical RAM map:\n");
-		print_memory_map("user");
-	}
+	address = memparse(arg, &arg);
+	reserve_top_address(address);
+	return 0;
 }
+early_param("reservetop", parse_reservetop);
+#endif
 
 /*
  * Callback for efi_memory_walk.
@@ -1039,7 +927,7 @@ efi_find_max_pfn(unsigned long start, un
 static int __init
 efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
 {
-	memory_present(0, start, end);
+	memory_present(0, PFN_UP(start), PFN_DOWN(end));
 	return 0;
 }
 
@@ -1306,6 +1194,14 @@ static unsigned long __init setup_memory
 	}
 	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
 		pages_to_mb(highend_pfn - highstart_pfn));
+	num_physpages = highend_pfn;
+	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
+#else
+	num_physpages = max_low_pfn;
+	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
+#endif
+#ifdef CONFIG_FLATMEM
+	max_mapnr = num_physpages;
 #endif
 	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
 			pages_to_mb(max_low_pfn));
@@ -1317,22 +1213,19 @@ static unsigned long __init setup_memory
 
 void __init zone_sizes_init(void)
 {
-	unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
-	unsigned int max_dma, low;
-
-	max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-	low = max_low_pfn;
-
-	if (low < max_dma)
-		zones_size[ZONE_DMA] = low;
-	else {
-		zones_size[ZONE_DMA] = max_dma;
-		zones_size[ZONE_NORMAL] = low - max_dma;
+	unsigned long max_zone_pfns[MAX_NR_ZONES];
+	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+	max_zone_pfns[ZONE_DMA] =
+		virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
 #ifdef CONFIG_HIGHMEM
-		zones_size[ZONE_HIGHMEM] = highend_pfn - low;
+	max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
+	add_active_range(0, 0, highend_pfn);
+#else
+	add_active_range(0, 0, max_low_pfn);
 #endif
-	}
-	free_area_init(zones_size);
+
+	free_area_init_nodes(max_zone_pfns);
 }
 #else
 extern unsigned long __init setup_memory(void);
@@ -1389,6 +1282,7 @@ void __init setup_bootmem_allocator(void
 	 */
 	acpi_reserve_bootmem();
 #endif
+	numa_kva_reserve();
 #endif /* !CONFIG_XEN */
 
 #ifdef CONFIG_BLK_DEV_INITRD
@@ -1574,7 +1468,7 @@ static int __init request_standard_resou
 	request_resource(&iomem_resource, &video_ram_resource);
 
 	/* request I/O space for devices used on all i[345]86 PCs */
-	for (i = 0; i < STANDARD_IO_RESOURCES; i++)
+	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
 		request_resource(&ioport_resource, &standard_io_resources[i]);
 	return 0;
 }
@@ -1705,17 +1599,19 @@ void __init setup_arch(char **cmdline_p)
 	data_resource.start = virt_to_phys(_etext);
 	data_resource.end = virt_to_phys(_edata)-1;
 
-	parse_cmdline_early(cmdline_p);
+	if ((i = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
+		i = COMMAND_LINE_SIZE;
+	memcpy(saved_command_line, xen_start_info->cmd_line, i);
+	saved_command_line[i - 1] = '\0';
+	parse_early_param();
 
-#ifdef CONFIG_EARLY_PRINTK
-	{
-		char *s = strstr(*cmdline_p, "earlyprintk=");
-		if (s) {
-			setup_early_printk(strchr(s, '=') + 1);
-			printk("early console enabled\n");
-		}
+	if (user_defined_memmap) {
+		printk(KERN_INFO "user-defined physical RAM map:\n");
+		print_memory_map("user");
 	}
-#endif
+
+	strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE);
+	*cmdline_p = command_line;
 
 	max_low_pfn = setup_memory();
 
@@ -1822,7 +1718,7 @@ void __init setup_arch(char **cmdline_p)
 		dmi_scan_machine();
 
 #ifdef CONFIG_X86_GENERICARCH
-	generic_apic_probe(*cmdline_p);
+	generic_apic_probe();
 #endif	
 	if (efi_enabled)
 		efi_map_memmap();
@@ -1843,9 +1739,11 @@ void __init setup_arch(char **cmdline_p)
 	acpi_boot_table_init();
 #endif
 
+#ifdef CONFIG_PCI
 #ifdef CONFIG_X86_IO_APIC
 	check_acpi_pci();	/* Checks more than just ACPI actually */
 #endif
+#endif
 
 #ifdef CONFIG_ACPI
 	acpi_boot_init();
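
Every early_param() handler introduced above (mem=, memmap=, highmem=,
vmalloc=, elfcorehdr=, reservetop=) leans on memparse() to accept
k/M/G-suffixed sizes and to advance the parse cursor past what it
consumed. A simplified userspace re-implementation of that helper,
assuming only the binary k/M/G suffixes matter:

#include <stdio.h>
#include <stdlib.h>

/*
 * Parse a number with an optional k/K, m/M or g/G binary suffix and
 * leave *retptr at the first unconsumed character, as the mem= and
 * memmap= handlers above rely on.
 */
static unsigned long long memparse(const char *ptr, char **retptr)
{
	unsigned long long ret = strtoull(ptr, retptr, 0);

	switch (**retptr) {
	case 'G': case 'g':
		ret <<= 10;
		/* fall through */
	case 'M': case 'm':
		ret <<= 10;
		/* fall through */
	case 'K': case 'k':
		ret <<= 10;
		(*retptr)++;
	default:
		break;
	}
	return ret;
}

int main(void)
{
	char *arg = "512M@1G";	/* memmap=512M@1G style argument */
	unsigned long long size, start = 0;

	size = memparse(arg, &arg);		/* 536870912, arg -> "@1G" */
	if (*arg == '@')
		start = memparse(arg + 1, &arg);	/* 1073741824 */
	printf("size=%llu start=%llu\n", size, start);
	return 0;
}
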
Index: head-2008-07-15/arch/x86/kernel/smp_32-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/smp_32-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/smp_32-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -279,8 +279,7 @@ static inline void leave_mm (unsigned lo
  * 2) Leave the mm if we are in the lazy tlb mode.
  */
 
-irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
-				     struct pt_regs *regs)
+irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id)
 {
 	unsigned long cpu;
 
@@ -567,16 +566,14 @@ void smp_send_stop(void)
  * all the work is done automatically when
  * we return from the interrupt.
  */
-irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id,
-				     struct pt_regs *regs)
+irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id)
 {
 
 	return IRQ_HANDLED;
 }
 
 #include <linux/kallsyms.h>
-irqreturn_t smp_call_function_interrupt(int irq, void *dev_id,
-					struct pt_regs *regs)
+irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
 {
 	void (*func) (void *info) = call_data->func;
 	void *info = call_data->info;
@@ -603,3 +600,69 @@ irqreturn_t smp_call_function_interrupt(
 	return IRQ_HANDLED;
 }
 
+/*
+ * this function sends a 'generic call function' IPI to one other CPU
+ * in the system.
+ *
+ * cpu is a standard Linux logical CPU number.
+ */
+static void
+__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+				int nonatomic, int wait)
+{
+	struct call_data_struct data;
+	int cpus = 1;
+
+	data.func = func;
+	data.info = info;
+	atomic_set(&data.started, 0);
+	data.wait = wait;
+	if (wait)
+		atomic_set(&data.finished, 0);
+
+	call_data = &data;
+	wmb();
+	/* Send a message to the target CPU and wait for it to respond */
+	send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
+
+	/* Wait for response */
+	while (atomic_read(&data.started) != cpus)
+		cpu_relax();
+
+	if (!wait)
+		return;
+
+	while (atomic_read(&data.finished) != cpus)
+		cpu_relax();
+}
+
+/*
+ * smp_call_function_single - Run a function on another CPU
+ * @cpu: The target CPU. Cannot be the calling CPU.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @nonatomic: Currently unused.
+ * @wait: If true, wait until the function has completed on the other CPU.
+ *
+ * Returns 0 on success, else a negative status code.
+ *
+ * Does not return until the remote CPU is nearly ready to execute <func>,
+ * or has already executed it.
+ */
+
+int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+			int nonatomic, int wait)
+{
+	/* prevent preemption and reschedule on another processor */
+	int me = get_cpu();
+	if (cpu == me) {
+		WARN_ON(1);
+		put_cpu();
+		return -EBUSY;
+	}
+	spin_lock_bh(&call_lock);
+	__smp_call_function_single(cpu, func, info, nonatomic, wait);
+	spin_unlock_bh(&call_lock);
+	put_cpu();
+	return 0;
+}
+EXPORT_SYMBOL(smp_call_function_single);
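
The new smp_call_function_single() coordinates entirely through the two
atomic counters in call_data_struct: the target CPU bumps "started" when
it picks the call up and "finished" once func has run, while the sender
spins on each. A pthread model of that handshake; threads stand in for
CPUs and the IPI, and the spin loops mirror the kernel's cpu_relax()
waits:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct call_data_struct {
	void (*func)(void *info);
	void *info;
	atomic_int started;
	atomic_int finished;
	int wait;
};

static struct call_data_struct *call_data;

/* Stand-in for the CPU that receives CALL_FUNCTION_VECTOR. */
static void *ipi_target(void *unused)
{
	void (*func)(void *) = call_data->func;
	void *info = call_data->info;
	int wait = call_data->wait;

	atomic_fetch_add(&call_data->started, 1);	/* "got the IPI" */
	func(info);
	if (wait)
		atomic_fetch_add(&call_data->finished, 1);
	return NULL;
}

static void say_hello(void *info)
{
	printf("running %s on the \"other cpu\"\n", (char *)info);
}

int main(void)
{
	struct call_data_struct data = {
		.func = say_hello, .info = "say_hello", .wait = 1,
	};
	pthread_t other_cpu;

	call_data = &data;	/* the kernel issues wmb() before the IPI */
	pthread_create(&other_cpu, NULL, ipi_target, NULL);

	while (atomic_load(&data.started) != 1)		/* wait for pickup */
		;
	while (atomic_load(&data.finished) != 1)	/* wait for completion */
		;
	pthread_join(other_cpu, NULL);
	return 0;
}
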
Index: head-2008-07-15/arch/x86/kernel/time_32-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/time_32-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/time_32-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -89,7 +89,6 @@ int pit_latch_buggy;              /* ext
 unsigned long vxtime_hz = PIT_TICK_RATE;
 struct vxtime_data __vxtime __section_vxtime;   /* for vsyscalls */
 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
-unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
 struct timespec __xtime __section_xtime;
 struct timezone __sys_tz __section_sys_tz;
 #endif
@@ -97,8 +96,6 @@ struct timezone __sys_tz __section_sys_t
 unsigned int cpu_khz;	/* Detected as we calibrate the TSC */
 EXPORT_SYMBOL(cpu_khz);
 
-extern unsigned long wall_jiffies;
-
 DEFINE_SPINLOCK(rtc_lock);
 EXPORT_SYMBOL(rtc_lock);
 
@@ -265,11 +262,10 @@ static void __update_wallclock(time_t se
 	time_t wtm_sec, xtime_sec;
 	u64 tmp, wc_nsec;
 
-	/* Adjust wall-clock time base based on wall_jiffies ticks. */
+	/* Adjust wall-clock time base. */
 	wc_nsec = processed_system_time;
 	wc_nsec += sec * (u64)NSEC_PER_SEC;
 	wc_nsec += nsec;
-	wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK;
 
 	/* Split wallclock base into seconds and nanoseconds. */
 	tmp = wc_nsec;
@@ -387,16 +383,10 @@ void do_gettimeofday(struct timeval *tv)
 	shadow = &per_cpu(shadow_time, cpu);
 
 	do {
-		unsigned long lost;
-
 		local_time_version = shadow->version;
 		seq = read_seqbegin(&xtime_lock);
 
 		usec = get_usec_offset(shadow);
-		lost = jiffies - wall_jiffies;
-
-		if (unlikely(lost))
-			usec += lost * (USEC_PER_SEC / HZ);
 
 		sec = xtime.tv_sec;
 		usec += (xtime.tv_nsec / NSEC_PER_USEC);
@@ -519,7 +509,7 @@ static void sync_xen_wallclock(unsigned 
 	write_seqlock_irq(&xtime_lock);
 
 	sec  = xtime.tv_sec;
-	nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
+	nsec = xtime.tv_nsec;
 	__normalize_time(&sec, &nsec);
 
 	op.cmd = XENPF_settime;
@@ -593,42 +583,49 @@ unsigned long long sched_clock(void)
 }
 #endif
 
-#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
 unsigned long profile_pc(struct pt_regs *regs)
 {
 	unsigned long pc = instruction_pointer(regs);
 
-#ifdef __x86_64__
-	/* Assume the lock function has either no stack frame or only a single word.
-	   This checks if the address on the stack looks like a kernel text address.
-	   There is a small window for false hits, but in that case the tick
-	   is just accounted to the spinlock function.
-	   Better would be to write these functions in assembler again
-	   and check exactly. */
+#if defined(CONFIG_SMP) || defined(__x86_64__)
 	if (!user_mode_vm(regs) && in_lock_functions(pc)) {
-		char *v = *(char **)regs->rsp;
-		if ((v >= _stext && v <= _etext) ||
-			(v >= _sinittext && v <= _einittext) ||
-			(v >= (char *)MODULES_VADDR  && v <= (char *)MODULES_END))
-			return (unsigned long)v;
-		return ((unsigned long *)regs->rsp)[1];
+# ifdef CONFIG_FRAME_POINTER
+#  ifdef __i386__
+		return ((unsigned long *)regs->ebp)[1];
+#  else
+		return ((unsigned long *)regs->rbp)[1];
+#  endif
+# else
+#  ifdef __i386__
+		unsigned long *sp;
+		if ((regs->xcs & 2) == 0)
+			sp = (unsigned long *)&regs->esp;
+		else
+			sp = (unsigned long *)regs->esp;
+#  else
+		unsigned long *sp = (unsigned long *)regs->rsp;
+#  endif
+		/* Return address is either directly at stack pointer
+		   or above a saved eflags. Eflags has bits 22-31 zero,
+		   kernel addresses don't. */
+		if (sp[0] >> 22)
+			return sp[0];
+		if (sp[1] >> 22)
+			return sp[1];
+# endif
 	}
-#else
-	if (!user_mode_vm(regs) && in_lock_functions(pc))
-		return *(unsigned long *)(regs->ebp + 4);
 #endif
 
 	return pc;
 }
 EXPORT_SYMBOL(profile_pc);
-#endif
 
 /*
  * This is the same as the above, except we _also_ save the current
  * Time Stamp Counter value at the time of the timer interrupt, so that
  * we later on can estimate the time of day more exactly.
  */
-irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+irqreturn_t timer_interrupt(int irq, void *dev_id)
 {
 	s64 delta, delta_cpu, stolen, blocked;
 	u64 sched_time;
@@ -686,10 +683,14 @@ irqreturn_t timer_interrupt(int irq, voi
 	}
 
 	/* System-wide jiffy work. */
-	while (delta >= NS_PER_TICK) {
-		delta -= NS_PER_TICK;
-		processed_system_time += NS_PER_TICK;
-		do_timer(regs);
+	if (delta >= NS_PER_TICK) {
+		do_div(delta, NS_PER_TICK);
+		processed_system_time += delta * NS_PER_TICK;
+		while (delta > HZ) {
+			do_timer(HZ);
+			delta -= HZ;
+		}
+		do_timer(delta);
 	}
 
 	if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
@@ -734,7 +735,7 @@ irqreturn_t timer_interrupt(int irq, voi
 	if (delta_cpu > 0) {
 		do_div(delta_cpu, NS_PER_TICK);
 		per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
-		if (user_mode_vm(regs))
+		if (user_mode_vm(get_irq_regs()))
 			account_user_time(current, (cputime_t)delta_cpu);
 		else
 			account_system_time(current, HARDIRQ_OFFSET,
@@ -748,10 +749,10 @@ irqreturn_t timer_interrupt(int irq, voi
 	/* Local timer processing (see update_process_times()). */
 	run_local_timers();
 	if (rcu_pending(cpu))
-		rcu_check_callbacks(cpu, user_mode_vm(regs));
+		rcu_check_callbacks(cpu, user_mode_vm(get_irq_regs()));
 	scheduler_tick();
 	run_posix_cpu_timers(current);
-	profile_tick(CPU_PROFILING, regs);
+	profile_tick(CPU_PROFILING);
 
 	return IRQ_HANDLED;
 }
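
The timer_interrupt() rework above stops calling do_timer() once per
lost tick: it converts the nanosecond backlog into whole ticks and then
feeds do_timer() at most HZ ticks at a time. A small arithmetic model;
do_div() is rendered as plain 64-bit division, and the HZ/NS_PER_TICK
values are the conventional 100 Hz ones, chosen for illustration:

#include <stdint.h>
#include <stdio.h>

#define HZ		100
#define NS_PER_TICK	(1000000000ULL / HZ)

static uint64_t jiffies;

static void do_timer(uint64_t ticks)
{
	jiffies += ticks;
}

int main(void)
{
	uint64_t delta = 2345678901ULL;	/* ~2.35s of unaccounted time */
	uint64_t processed_system_time = 0;

	if (delta >= NS_PER_TICK) {
		delta /= NS_PER_TICK;		/* models do_div() */
		processed_system_time += delta * NS_PER_TICK;
		while (delta > HZ) {		/* at most HZ ticks per call */
			do_timer(HZ);
			delta -= HZ;
		}
		do_timer(delta);
	}
	printf("jiffies=%llu accounted=%lluns\n",
	       (unsigned long long)jiffies,
	       (unsigned long long)processed_system_time);
	return 0;
}
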
@@ -959,10 +960,11 @@ extern void (*late_time_init)(void);
 /* Duplicate of time_init() below, with hpet_enable part added */
 static void __init hpet_time_init(void)
 {
-	xtime.tv_sec = get_cmos_time();
-	xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
-	set_normalized_timespec(&wall_to_monotonic,
-		-xtime.tv_sec, -xtime.tv_nsec);
+	struct timespec ts;
+	ts.tv_sec = get_cmos_time();
+	ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
+
+	do_settimeofday(&ts);
 
 	if ((hpet_enable() >= 0) && hpet_use_timer) {
 		printk("Using HPET for base-timer\n");
Index: head-2008-07-15/arch/x86/kernel/traps_32-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/traps_32-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/traps_32-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -28,6 +28,7 @@
 #include <linux/kprobes.h>
 #include <linux/kexec.h>
 #include <linux/unwind.h>
+#include <linux/uaccess.h>
 
 #ifdef CONFIG_EISA
 #include <linux/ioport.h>
@@ -40,7 +41,6 @@
 
 #include <asm/processor.h>
 #include <asm/system.h>
-#include <asm/uaccess.h>
 #include <asm/io.h>
 #include <asm/atomic.h>
 #include <asm/debugreg.h>
@@ -51,11 +51,14 @@
 #include <asm/smp.h>
 #include <asm/arch_hooks.h>
 #include <asm/kdebug.h>
+#include <asm/stacktrace.h>
 
 #include <linux/module.h>
 
 #include "mach_traps.h"
 
+int panic_on_unrecovered_nmi;
+
 asmlinkage int system_call(void);
 
 struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
@@ -124,62 +127,63 @@ static inline int valid_stack_ptr(struct
 		p < (void *)tinfo + THREAD_SIZE - 3;
 }
 
-/*
- * Print one address/symbol entries per line.
- */
-static inline void print_addr_and_symbol(unsigned long addr, char *log_lvl)
-{
-	printk(" [<%08lx>] ", addr);
-
-	print_symbol("%s\n", addr);
-}
-
 static inline unsigned long print_context_stack(struct thread_info *tinfo,
 				unsigned long *stack, unsigned long ebp,
-				char *log_lvl)
+				struct stacktrace_ops *ops, void *data)
 {
 	unsigned long addr;
 
 #ifdef	CONFIG_FRAME_POINTER
 	while (valid_stack_ptr(tinfo, (void *)ebp)) {
+		unsigned long new_ebp;
 		addr = *(unsigned long *)(ebp + 4);
-		print_addr_and_symbol(addr, log_lvl);
+		ops->address(data, addr);
 		/*
 		 * break out of recursive entries (such as
-		 * end_of_stack_stop_unwind_function):
+		 * end_of_stack_stop_unwind_function). Also,
+		 * we can never allow a frame pointer to
+		 * move downwards!
 	 	 */
-		if (ebp == *(unsigned long *)ebp)
+	 	new_ebp = *(unsigned long *)ebp;
+		if (new_ebp <= ebp)
 			break;
-		ebp = *(unsigned long *)ebp;
+		ebp = new_ebp;
 	}
 #else
 	while (valid_stack_ptr(tinfo, stack)) {
 		addr = *stack++;
 		if (__kernel_text_address(addr))
-			print_addr_and_symbol(addr, log_lvl);
+			ops->address(data, addr);
 	}
 #endif
 	return ebp;
 }
 
+struct ops_and_data {
+	struct stacktrace_ops *ops;
+	void *data;
+};
+
 static asmlinkage int
-show_trace_unwind(struct unwind_frame_info *info, void *log_lvl)
+dump_trace_unwind(struct unwind_frame_info *info, void *data)
 {
+	struct ops_and_data *oad = (struct ops_and_data *)data;
 	int n = 0;
 
 	while (unwind(info) == 0 && UNW_PC(info)) {
 		n++;
-		print_addr_and_symbol(UNW_PC(info), log_lvl);
+		oad->ops->address(oad->data, UNW_PC(info));
 		if (arch_unw_user_mode(info))
 			break;
 	}
 	return n;
 }
 
-static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-			       unsigned long *stack, char *log_lvl)
+void dump_trace(struct task_struct *task, struct pt_regs *regs,
+	        unsigned long *stack,
+		struct stacktrace_ops *ops, void *data)
 {
-	unsigned long ebp;
+	unsigned long ebp = 0;
 
 	if (!task)
 		task = current;
@@ -187,54 +191,116 @@ static void show_trace_log_lvl(struct ta
 	if (call_trace >= 0) {
 		int unw_ret = 0;
 		struct unwind_frame_info info;
+		struct ops_and_data oad = { .ops = ops, .data = data };
 
 		if (regs) {
 			if (unwind_init_frame_info(&info, task, regs) == 0)
-				unw_ret = show_trace_unwind(&info, log_lvl);
+				unw_ret = dump_trace_unwind(&info, &oad);
 		} else if (task == current)
-			unw_ret = unwind_init_running(&info, show_trace_unwind, log_lvl);
+			unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
 		else {
 			if (unwind_init_blocked(&info, task) == 0)
-				unw_ret = show_trace_unwind(&info, log_lvl);
+				unw_ret = dump_trace_unwind(&info, &oad);
 		}
 		if (unw_ret > 0) {
 			if (call_trace == 1 && !arch_unw_user_mode(&info)) {
-				print_symbol("DWARF2 unwinder stuck at %s\n",
+				ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
 					     UNW_PC(&info));
 				if (UNW_SP(&info) >= PAGE_OFFSET) {
-					printk("Leftover inexact backtrace:\n");
+					ops->warning(data, "Leftover inexact backtrace:\n");
 					stack = (void *)UNW_SP(&info);
+					if (!stack)
+						return;
+					ebp = UNW_FP(&info);
 				} else
-					printk("Full inexact backtrace again:\n");
+					ops->warning(data, "Full inexact backtrace again:\n");
 			} else if (call_trace >= 1)
 				return;
 			else
-				printk("Full inexact backtrace again:\n");
+				ops->warning(data, "Full inexact backtrace again:\n");
 		} else
-			printk("Inexact backtrace:\n");
+			ops->warning(data, "Inexact backtrace:\n");
 	}
-
-	if (task == current) {
-		/* Grab ebp right from our regs */
-		asm ("movl %%ebp, %0" : "=r" (ebp) : );
-	} else {
-		/* ebp is the last reg pushed by switch_to */
-		ebp = *(unsigned long *) task->thread.esp;
+	if (!stack) {
+		unsigned long dummy;
+		stack = &dummy;
+		if (task && task != current)
+			stack = (unsigned long *)task->thread.esp;
+	}
+
+#ifdef CONFIG_FRAME_POINTER
+	if (!ebp) {
+		if (task == current) {
+			/* Grab ebp right from our regs */
+			asm ("movl %%ebp, %0" : "=r" (ebp) : );
+		} else {
+			/* ebp is the last reg pushed by switch_to */
+			ebp = *(unsigned long *) task->thread.esp;
+		}
 	}
+#endif
 
 	while (1) {
 		struct thread_info *context;
 		context = (struct thread_info *)
 			((unsigned long)stack & (~(THREAD_SIZE - 1)));
-		ebp = print_context_stack(context, stack, ebp, log_lvl);
+		ebp = print_context_stack(context, stack, ebp, ops, data);
+		/* Should be after the line below, but somewhere
+		   in early boot context comes out corrupted and we
+		   can't reference it -AK */
+		if (ops->stack(data, "IRQ") < 0)
+			break;
 		stack = (unsigned long*)context->previous_esp;
 		if (!stack)
 			break;
-		printk("%s =======================\n", log_lvl);
 	}
 }
+EXPORT_SYMBOL(dump_trace);
+
+static void
+print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+	printk(data);
+	print_symbol(msg, symbol);
+	printk("\n");
+}
+
+static void print_trace_warning(void *data, char *msg)
+{
+	printk("%s%s\n", (char *)data, msg);
+}
 
-void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long * stack)
+static int print_trace_stack(void *data, char *name)
+{
+	return 0;
+}
+
+/*
+ * Print one address/symbol entry per line.
+ */
+static void print_trace_address(void *data, unsigned long addr)
+{
+	printk("%s [<%08lx>] ", (char *)data, addr);
+	print_symbol("%s\n", addr);
+}
+
+static struct stacktrace_ops print_trace_ops = {
+	.warning = print_trace_warning,
+	.warning_symbol = print_trace_warning_symbol,
+	.stack = print_trace_stack,
+	.address = print_trace_address,
+};
+
+static void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+		   unsigned long * stack, char *log_lvl)
+{
+	dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
+	printk("%s =======================\n", log_lvl);
+}
+
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+		unsigned long * stack)
 {
 	show_trace_log_lvl(task, regs, stack, "");
 }
@@ -297,12 +363,13 @@ void show_registers(struct pt_regs *regs
 		ss = regs->xss & 0xffff;
 	}
 	print_modules();
-	printk(KERN_EMERG "CPU:    %d\nEIP:    %04x:[<%08lx>]    %s VLI\n"
-			"EFLAGS: %08lx   (%s %.*s) \n",
+	printk(KERN_EMERG "CPU:    %d\n"
+		KERN_EMERG "EIP:    %04x:[<%08lx>]    %s VLI\n"
+		KERN_EMERG "EFLAGS: %08lx   (%s %.*s)\n",
 		smp_processor_id(), 0xffff & regs->xcs, regs->eip,
-		print_tainted(), regs->eflags, system_utsname.release,
-		(int)strcspn(system_utsname.version, " "),
-		system_utsname.version);
+		print_tainted(), regs->eflags, init_utsname()->release,
+		(int)strcspn(init_utsname()->version, " "),
+		init_utsname()->version);
 	print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip);
 	printk(KERN_EMERG "eax: %08lx   ebx: %08lx   ecx: %08lx   edx: %08lx\n",
 		regs->eax, regs->ebx, regs->ecx, regs->edx);
@@ -319,6 +386,8 @@ void show_registers(struct pt_regs *regs
 	 */
 	if (in_kernel) {
 		u8 __user *eip;
+		int code_bytes = 64;
+		unsigned char c;
 
 		printk("\n" KERN_EMERG "Stack: ");
 		show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG);
@@ -326,9 +395,12 @@ void show_registers(struct pt_regs *regs
 		printk(KERN_EMERG "Code: ");
 
 		eip = (u8 __user *)regs->eip - 43;
-		for (i = 0; i < 64; i++, eip++) {
-			unsigned char c;
-
+		if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
+			/* try starting at EIP */
+			eip = (u8 __user *)regs->eip;
+			code_bytes = 32;
+		}
+		for (i = 0; i < code_bytes; i++, eip++) {
 			if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
 				printk(" Bad EIP value.");
 				break;
@@ -349,7 +421,7 @@ static void handle_BUG(struct pt_regs *r
 
 	if (eip < PAGE_OFFSET)
 		return;
-	if (__get_user(ud2, (unsigned short __user *)eip))
+	if (probe_kernel_address((unsigned short __user *)eip, ud2))
 		return;
 	if (ud2 != 0x0b0f)
 		return;
@@ -362,7 +434,8 @@ static void handle_BUG(struct pt_regs *r
 		char *file;
 		char c;
 
-		if (__get_user(line, (unsigned short __user *)(eip + 2)))
+		if (probe_kernel_address((unsigned short __user *)(eip + 2),
+					line))
 			break;
 		if (__get_user(file, (char * __user *)(eip + 4)) ||
 		    (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
@@ -604,18 +677,24 @@ gp_in_kernel:
 	}
 }
 
-static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
+static __kprobes void
+mem_parity_error(unsigned char reason, struct pt_regs * regs)
 {
-	printk(KERN_EMERG "Uhhuh. NMI received. Dazed and confused, but trying "
-			"to continue\n");
+	printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
+		"CPU %d.\n", reason, smp_processor_id());
 	printk(KERN_EMERG "You probably have a hardware problem with your RAM "
 			"chips\n");
+	if (panic_on_unrecovered_nmi)
+		panic("NMI: Not continuing");
+
+	printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
 
 	/* Clear and disable the memory parity error line. */
 	clear_mem_error(reason);
 }
 
-static void io_check_error(unsigned char reason, struct pt_regs * regs)
+static __kprobes void
+io_check_error(unsigned char reason, struct pt_regs * regs)
 {
 	printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
 	show_registers(regs);
@@ -624,7 +703,8 @@ static void io_check_error(unsigned char
 	clear_io_check_error(reason);
 }
 
-static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
+static __kprobes void
+unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
 {
 #ifdef CONFIG_MCA
 	/* Might actually be able to figure out what the guilty party
@@ -634,15 +714,18 @@ static void unknown_nmi_error(unsigned c
 		return;
 	}
 #endif
-	printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
-		reason, smp_processor_id());
-	printk("Dazed and confused, but trying to continue\n");
-	printk("Do you have a strange power saving mode enabled?\n");
+	printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
+		"CPU %d.\n", reason, smp_processor_id());
+	printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
+	if (panic_on_unrecovered_nmi)
+		panic("NMI: Not continuing");
+
+	printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
 }
 
 static DEFINE_SPINLOCK(nmi_print_lock);
 
-void die_nmi (struct pt_regs *regs, const char *msg)
+void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
 {
 	if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
 	    NOTIFY_STOP)
@@ -674,7 +757,7 @@ void die_nmi (struct pt_regs *regs, cons
 	do_exit(SIGSEGV);
 }
 
-static void default_do_nmi(struct pt_regs * regs)
+static __kprobes void default_do_nmi(struct pt_regs * regs)
 {
 	unsigned char reason = 0;
 
@@ -691,12 +774,12 @@ static void default_do_nmi(struct pt_reg
 		 * Ok, so this is none of the documented NMI sources,
 		 * so it must be the NMI watchdog.
 		 */
-		if (nmi_watchdog) {
-			nmi_watchdog_tick(regs);
+		if (nmi_watchdog_tick(regs, reason))
 			return;
-		}
+		if (!do_nmi_callback(regs, smp_processor_id()))
 #endif
-		unknown_nmi_error(reason, regs);
+			unknown_nmi_error(reason, regs);
+
 		return;
 	}
 	if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
@@ -712,14 +795,7 @@ static void default_do_nmi(struct pt_reg
 	reassert_nmi();
 }
 
-static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
-{
-	return 0;
-}
- 
-static nmi_callback_t nmi_callback = dummy_nmi_callback;
- 
-fastcall void do_nmi(struct pt_regs * regs, long error_code)
+fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
 {
 	int cpu;
 
@@ -729,25 +805,11 @@ fastcall void do_nmi(struct pt_regs * re
 
 	++nmi_count(cpu);
 
-	if (!rcu_dereference(nmi_callback)(regs, cpu))
-		default_do_nmi(regs);
+	default_do_nmi(regs);
 
 	nmi_exit();
 }
 
-void set_nmi_callback(nmi_callback_t callback)
-{
-	vmalloc_sync_all();
-	rcu_assign_pointer(nmi_callback, callback);
-}
-EXPORT_SYMBOL_GPL(set_nmi_callback);
-
-void unset_nmi_callback(void)
-{
-	nmi_callback = dummy_nmi_callback;
-}
-EXPORT_SYMBOL_GPL(unset_nmi_callback);
-
 #ifdef CONFIG_KPROBES
 fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
 {
Index: head-2008-07-15/arch/x86/mach-xen/setup.c
===================================================================
--- head-2008-07-15.orig/arch/x86/mach-xen/setup.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/mach-xen/setup.c	2008-07-15 14:24:17.000000000 +0200
@@ -103,8 +103,10 @@ void __init pre_setup_arch_hook(void)
 
 	setup_xen_features();
 
-	if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
-		set_fixaddr_top(pp.virt_start);
+	if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) {
+		hypervisor_virt_start = pp.virt_start;
+		reserve_top_address(0UL - pp.virt_start);
+	}
 
 	if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
 		machine_to_phys_mapping = (unsigned long *)mapping.v_start;
Index: head-2008-07-15/arch/x86/mm/fault_32-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/mm/fault_32-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/mm/fault_32-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -27,21 +27,24 @@
 #include <asm/uaccess.h>
 #include <asm/desc.h>
 #include <asm/kdebug.h>
+#include <asm/segment.h>
 
 extern void die(const char *,struct pt_regs *,long);
 
-#ifdef CONFIG_KPROBES
-ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
+static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
+
 int register_page_fault_notifier(struct notifier_block *nb)
 {
 	vmalloc_sync_all();
 	return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
 }
+EXPORT_SYMBOL_GPL(register_page_fault_notifier);
 
 int unregister_page_fault_notifier(struct notifier_block *nb)
 {
 	return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
 }
+EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
 
 static inline int notify_page_fault(enum die_val val, const char *str,
 			struct pt_regs *regs, long err, int trap, int sig)
@@ -55,14 +58,6 @@ static inline int notify_page_fault(enum
 	};
 	return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
 }
-#else
-static inline int notify_page_fault(enum die_val val, const char *str,
-			struct pt_regs *regs, long err, int trap, int sig)
-{
-	return NOTIFY_DONE;
-}
-#endif
-
 
 /*
  * Unlock any spinlocks which will prevent us from getting the
@@ -119,10 +114,10 @@ static inline unsigned long get_segment_
 	}
 
 	/* The standard kernel/user address space limit. */
-	*eip_limit = (seg & 2) ? USER_DS.seg : KERNEL_DS.seg;
+	*eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
 	
 	/* By far the most common cases. */
-	if (likely(seg == __USER_CS || seg == GET_KERNEL_CS()))
+	if (likely(SEGMENT_IS_FLAT_CODE(seg)))
 		return eip;
 
 	/* Check the segment exists, is within the current LDT/GDT size,
@@ -559,11 +554,7 @@ good_area:
 	write = 0;
 	switch (error_code & 3) {
 		default:	/* 3: write, present */
-#ifdef TEST_VERIFY_AREA
-			if (regs->cs == GET_KERNEL_CS())
-				printk("WP fault at %08lx\n", regs->eip);
-#endif
-			/* fall through */
+				/* fall through */
 		case 2:		/* write, not present */
 			if (!(vma->vm_flags & VM_WRITE))
 				goto bad_area;
@@ -572,7 +563,7 @@ good_area:
 		case 1:		/* read, present */
 			goto bad_area;
 		case 0:		/* read, not present */
-			if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+			if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
 				goto bad_area;
 	}
 
@@ -704,7 +695,7 @@ no_context:
  */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (tsk->pid == 1) {
+	if (is_init(tsk)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
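
The fault_32 hunk makes the page-fault notifier chain unconditional and
exports its register/unregister helpers. A toy singly linked notifier
chain showing the same call pattern; it omits locking, which the
kernel's atomic_notifier_* variants supply via RCU:

#include <stdio.h>

#define NOTIFY_DONE	0

struct notifier_block {
	int (*notifier_call)(struct notifier_block *nb,
			     unsigned long val, void *data);
	struct notifier_block *next;
};

static struct notifier_block *notify_page_fault_chain;

static void notifier_chain_register(struct notifier_block *nb)
{
	nb->next = notify_page_fault_chain;
	notify_page_fault_chain = nb;
}

/* Invoke every registered callback in order, as die chains do. */
static int notifier_call_chain(unsigned long val, void *data)
{
	struct notifier_block *nb;
	int ret = NOTIFY_DONE;

	for (nb = notify_page_fault_chain; nb; nb = nb->next)
		ret = nb->notifier_call(nb, val, data);
	return ret;
}

static int my_fault_handler(struct notifier_block *nb,
			    unsigned long val, void *data)
{
	printf("page fault event %lu at %p\n", val, data);
	return NOTIFY_DONE;
}

static struct notifier_block my_nb = { .notifier_call = my_fault_handler };

int main(void)
{
	notifier_chain_register(&my_nb);
	notifier_call_chain(14, (void *)0xdeadbeefUL);	/* 14 == #PF vector */
	return 0;
}
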
Index: head-2008-07-15/arch/x86/mm/highmem_32-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/mm/highmem_32-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/mm/highmem_32-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -38,11 +38,9 @@ static void *__kmap_atomic(struct page *
 
 	idx = type + KM_TYPE_NR*smp_processor_id();
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-#ifdef CONFIG_DEBUG_HIGHMEM
 	if (!pte_none(*(kmap_pte-idx)))
 		BUG();
-#endif
-	set_pte_at_sync(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot));
+	set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot));
 
 	return (void*) vaddr;
 }
@@ -62,36 +60,26 @@ void *kmap_atomic_pte(struct page *page,
 
 void kunmap_atomic(void *kvaddr, enum km_type type)
 {
-#if defined(CONFIG_DEBUG_HIGHMEM) || defined(CONFIG_XEN)
 	unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
 	enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
 
-	if (vaddr < FIXADDR_START) { // FIXME
+#ifdef CONFIG_DEBUG_HIGHMEM
+	if (vaddr >= PAGE_OFFSET && vaddr < (unsigned long)high_memory) {
 		dec_preempt_count();
 		preempt_check_resched();
 		return;
 	}
-#endif
 
-#if defined(CONFIG_DEBUG_HIGHMEM)
 	if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
 		BUG();
-
-	/*
-	 * force other mappings to Oops if they'll try to access
-	 * this pte without first remap it
-	 */
-	pte_clear(&init_mm, vaddr, kmap_pte-idx);
-	__flush_tlb_one(vaddr);
-#elif defined(CONFIG_XEN)
+#endif
 	/*
-	 * We must ensure there are no dangling pagetable references when
-	 * returning memory to Xen (decrease_reservation).
-	 * XXX TODO: We could make this faster by only zapping when
-	 * kmap_flush_unused is called but that is trickier and more invasive.
+	 * Force other mappings to Oops if they try to access this pte
+	 * without first remapping it.  Keeping stale mappings around is
+	 * also a bad idea, in case the page changes cacheability
+	 * attributes or becomes a protected page in a hypervisor.
 	 */
-	pte_clear(&init_mm, vaddr, kmap_pte-idx);
-#endif
+	kpte_clear_flush(kmap_pte-idx, vaddr);
 
 	dec_preempt_count();
 	preempt_check_resched();
@@ -110,7 +98,6 @@ void *kmap_atomic_pfn(unsigned long pfn,
 	idx = type + KM_TYPE_NR*smp_processor_id();
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
 	set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
-	__flush_tlb_one(vaddr);
 
 	return (void*) vaddr;
 }
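
kmap_atomic()/kunmap_atomic() above index a per-CPU window of fixmap
slots: idx = type + KM_TYPE_NR * cpu, and the resulting virtual address
sits a fixed number of pages below the fixmap top. A sketch of the slot
arithmetic, where KM_TYPE_NR and the fixmap constants are illustrative
stand-ins:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define FIXADDR_TOP	0xfffff000UL
#define FIX_KMAP_BEGIN	16	/* first kmap slot in the fixmap */
#define KM_TYPE_NR	14	/* slots per CPU */

/* __fix_to_virt() counts pages down from the top of the fixmap. */
static unsigned long __fix_to_virt(unsigned long idx)
{
	return FIXADDR_TOP - idx * PAGE_SIZE;
}

int main(void)
{
	int type = 2, cpu = 3;
	unsigned long idx = type + KM_TYPE_NR * cpu;
	unsigned long vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);

	printf("cpu %d type %d -> slot %lu, vaddr %#lx\n",
	       cpu, type, idx, vaddr);
	return 0;
}
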
Index: head-2008-07-15/arch/x86/mm/hypervisor.c
===================================================================
--- head-2008-07-15.orig/arch/x86/mm/hypervisor.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/mm/hypervisor.c	2008-07-15 14:24:17.000000000 +0200
@@ -569,7 +569,8 @@ int write_ldt_entry(void *ldt, int entry
 #define MAX_BATCHED_FULL_PTES 32
 
 int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
-			 unsigned long addr, unsigned long end, pgprot_t newprot)
+			 unsigned long addr, unsigned long end, pgprot_t newprot,
+			 int dirty_accountable)
 {
 	int rc = 0, i = 0;
 	mmu_update_t u[MAX_BATCHED_FULL_PTES];
@@ -582,10 +583,14 @@ int xen_change_pte_range(struct mm_struc
 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	do {
 		if (pte_present(*pte)) {
+			pte_t ptent = pte_modify(*pte, newprot);
+
+			if (dirty_accountable && pte_dirty(ptent))
+				ptent = pte_mkwrite(ptent);
 			u[i].ptr = (__pmd_val(*pmd) & PHYSICAL_PAGE_MASK)
 				   | ((unsigned long)pte & ~PAGE_MASK)
 				   | MMU_PT_UPDATE_PRESERVE_AD;
-			u[i].val = __pte_val(pte_modify(*pte, newprot));
+			u[i].val = __pte_val(ptent);
 			if (++i == MAX_BATCHED_FULL_PTES) {
 				if ((rc = HYPERVISOR_mmu_update(
 					&u[0], i, NULL, DOMID_SELF)) != 0)
Index: head-2008-07-15/arch/x86/mm/init_32-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/mm/init_32-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/mm/init_32-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -464,16 +464,22 @@ EXPORT_SYMBOL(__supported_pte_mask);
  * on      Enable
  * off     Disable
  */
-void __init noexec_setup(const char *str)
+static int __init noexec_setup(char *str)
 {
-	if (!strncmp(str, "on",2) && cpu_has_nx) {
-		__supported_pte_mask |= _PAGE_NX;
-		disable_nx = 0;
-	} else if (!strncmp(str,"off",3)) {
+	if (!str || !strcmp(str, "on")) {
+		if (cpu_has_nx) {
+			__supported_pte_mask |= _PAGE_NX;
+			disable_nx = 0;
+		}
+	} else if (!strcmp(str,"off")) {
 		disable_nx = 1;
 		__supported_pte_mask &= ~_PAGE_NX;
-	}
+	} else
+		return -EINVAL;
+
+	return 0;
 }
+early_param("noexec", noexec_setup);
 
 int nx_enabled = 0;
 #ifdef CONFIG_X86_PAE
@@ -516,6 +522,7 @@ int __init set_kernel_exec(unsigned long
 		pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
 	else
 		pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
+	pte_update_defer(&init_mm, vaddr, pte);
 	__flush_tlb_all();
 out:
 	return ret;
@@ -598,18 +605,6 @@ static void __init test_wp_bit(void)
 	}
 }
 
-static void __init set_max_mapnr_init(void)
-{
-#ifdef CONFIG_HIGHMEM
-	num_physpages = highend_pfn;
-#else
-	num_physpages = max_low_pfn;
-#endif
-#ifdef CONFIG_FLATMEM
-	max_mapnr = num_physpages;
-#endif
-}
-
 static struct kcore_list kcore_mem, kcore_vmalloc; 
 
 void __init mem_init(void)
@@ -630,8 +625,7 @@ void __init mem_init(void)
 #endif
 
 #ifdef CONFIG_FLATMEM
-	if (!mem_map)
-		BUG();
+	BUG_ON(!mem_map);
 #endif
 	
 	bad_ppro = ppro_with_ram_bug();
@@ -646,17 +640,6 @@ void __init mem_init(void)
 	}
 #endif
  
-	set_max_mapnr_init();
-
-#ifdef CONFIG_HIGHMEM
-	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
-#else
-	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
-#endif
-	printk("vmalloc area: %lx-%lx, maxmem %lx\n",
-	       VMALLOC_START,VMALLOC_END,MAXMEM);
-	BUG_ON(VMALLOC_START > VMALLOC_END);
-	
 	/* this will put all low memory onto the freelists */
 	totalram_pages += free_all_bootmem();
 	/* XEN: init and count low-mem pages outside initial allocation. */
@@ -694,6 +677,48 @@ void __init mem_init(void)
 		(unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
 	       );
 
+#if 1 /* double-sanity-check paranoia */
+	printk("virtual kernel memory layout:\n"
+	       "    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+#ifdef CONFIG_HIGHMEM
+	       "    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+#endif
+	       "    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
+	       "    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"
+	       "      .init : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+	       "      .data : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+	       "      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",
+	       FIXADDR_START, FIXADDR_TOP,
+	       (FIXADDR_TOP - FIXADDR_START) >> 10,
+
+#ifdef CONFIG_HIGHMEM
+	       PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
+	       (LAST_PKMAP*PAGE_SIZE) >> 10,
+#endif
+
+	       VMALLOC_START, VMALLOC_END,
+	       (VMALLOC_END - VMALLOC_START) >> 20,
+
+	       (unsigned long)__va(0), (unsigned long)high_memory,
+	       ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
+
+	       (unsigned long)&__init_begin, (unsigned long)&__init_end,
+	       ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,
+
+	       (unsigned long)&_etext, (unsigned long)&_edata,
+	       ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
+
+	       (unsigned long)&_text, (unsigned long)&_etext,
+	       ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
+
+#ifdef CONFIG_HIGHMEM
+	BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
+	BUG_ON(VMALLOC_END                     > PKMAP_BASE);
+#endif
+	BUG_ON(VMALLOC_START                   > VMALLOC_END);
+	BUG_ON((unsigned long)high_memory      > VMALLOC_START);
+#endif /* double-sanity-check paranoia */
+
 #ifdef CONFIG_X86_PAE
 	if (!cpu_has_pae)
 		panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
@@ -724,7 +749,7 @@ void __init mem_init(void)
 int arch_add_memory(int nid, u64 start, u64 size)
 {
 	struct pglist_data *pgdata = &contig_page_data;
-	struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
+	struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 
Index: head-2008-07-15/arch/x86/mm/ioremap_32-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/mm/ioremap_32-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/mm/ioremap_32-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -12,7 +12,7 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/module.h>
-#include <asm/io.h>
+#include <linux/io.h>
 #include <asm/fixmap.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
@@ -118,7 +118,7 @@ int direct_remap_pfn_range(struct vm_are
 	if (domid == DOMID_SELF)
 		return -EINVAL;
 
-	vma->vm_flags |= VM_IO | VM_RESERVED;
+	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
 
 	vma->vm_mm->context.has_foreign_mappings = 1;
 
@@ -203,6 +203,7 @@ void __iomem * __ioremap(unsigned long p
 	void __iomem * addr;
 	struct vm_struct * area;
 	unsigned long offset, last_addr;
+	pgprot_t prot;
 	domid_t domid = DOMID_IO;
 
 	/* Don't allow wraparound or zero size */
@@ -234,6 +235,8 @@ void __iomem * __ioremap(unsigned long p
 		domid = DOMID_SELF;
 	}
 
+	prot = __pgprot(_KERNPG_TABLE | flags);
+
 	/*
 	 * Mappings have to be page-aligned
 	 */
@@ -249,10 +252,9 @@ void __iomem * __ioremap(unsigned long p
 		return NULL;
 	area->phys_addr = phys_addr;
 	addr = (void __iomem *) area->addr;
-	flags |= _KERNPG_TABLE;
 	if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr,
 				     phys_addr>>PAGE_SHIFT,
-				     size, __pgprot(flags), domid)) {
+				     size, prot, domid)) {
 		vunmap((void __force *) addr);
 		return NULL;
 	}
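
Around the "Mappings have to be page-aligned" comment above, __ioremap()
splits a request into a page-aligned base plus an in-page offset and
rounds the length up to whole pages. A sketch of that standard alignment
arithmetic under an assumed 4 KiB PAGE_SIZE:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
	unsigned long phys_addr = 0xfeb00a10UL, size = 0x120UL;
	unsigned long last_addr = phys_addr + size - 1;
	unsigned long offset = phys_addr & ~PAGE_MASK;

	phys_addr &= PAGE_MASK;			/* aligned mapping base */
	size = PAGE_ALIGN(last_addr + 1) - phys_addr;	/* whole pages */

	printf("base=%#lx len=%#lx offset=%#lx\n", phys_addr, size, offset);
	/* the caller hands back mapping_base + offset */
	return 0;
}
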
Index: head-2008-07-15/arch/x86/mm/pgtable_32-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/mm/pgtable_32-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/mm/pgtable_32-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -68,7 +68,9 @@ void show_mem(void)
 	printk(KERN_INFO "%lu pages writeback\n",
 					global_page_state(NR_WRITEBACK));
 	printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
-	printk(KERN_INFO "%lu pages slab\n", global_page_state(NR_SLAB));
+	printk(KERN_INFO "%lu pages slab\n",
+		global_page_state(NR_SLAB_RECLAIMABLE) +
+		global_page_state(NR_SLAB_UNRECLAIMABLE));
 	printk(KERN_INFO "%lu pages pagetables\n",
 					global_page_state(NR_PAGETABLE));
 }
@@ -108,18 +110,11 @@ void set_pmd_pfn(unsigned long vaddr, un
 	__flush_tlb_one(vaddr);
 }
 
-static int nr_fixmaps = 0;
+static int fixmaps;
 unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START;
-unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - 2 * PAGE_SIZE);
+unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - PAGE_SIZE);
 EXPORT_SYMBOL(__FIXADDR_TOP);
 
-void __init set_fixaddr_top(unsigned long top)
-{
-	BUG_ON(nr_fixmaps > 0);
-	hypervisor_virt_start = top;
-	__FIXADDR_TOP = hypervisor_virt_start - 2 * PAGE_SIZE;
-}
-
 void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
 {
 	unsigned long address = __fix_to_virt(idx);
@@ -141,7 +136,21 @@ void __set_fixmap (enum fixed_addresses 
 	if (HYPERVISOR_update_va_mapping(address, pte,
 					 UVMF_INVLPG|UVMF_ALL))
 		BUG();
-	nr_fixmaps++;
+	fixmaps++;
+}
+
+/**
+ * reserve_top_address - reserves a hole in the top of kernel address space
+ * @reserve: size of hole to reserve
+ *
+ * Can be used to relocate the fixmap area and poke a hole in the top
+ * of kernel address space to make room for a hypervisor.
+ */
+void __init reserve_top_address(unsigned long reserve)
+{
+	BUG_ON(fixmaps > 0);
+	__FIXADDR_TOP = -reserve - PAGE_SIZE;
+	__VMALLOC_RESERVE += reserve;
 }
 
 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
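
The reserve_top_address() kernel-doc above is terse, so a hedged sketch of the
intended caller may help: a hypervisor port carving a hole at the top of the
32-bit address space before any fixmap slot has been populated, which is what
BUG_ON(fixmaps > 0) enforces. The function name and size are illustrative only:

	void __init example_hv_reserve(void)
	{
		reserve_top_address(64UL << 20);	/* 64MB hole below 4GB */
	}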
Index: head-2008-07-15/arch/x86/pci/irq-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/pci/irq-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/pci/irq-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -991,10 +991,6 @@ static void __init pcibios_fixup_irqs(vo
 							pci_name(bridge), 'A' + pin, irq);
 				}
 				if (irq >= 0) {
-					if (use_pci_vector() &&
-						!platform_legacy_irq(irq))
-						irq = IO_APIC_VECTOR(irq);
-
 					printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
 						pci_name(dev), 'A' + pin, irq);
 					dev->irq = irq;
@@ -1155,10 +1151,6 @@ static int pirq_enable_irq(struct pci_de
 			}
 			dev = temp_dev;
 			if (irq >= 0) {
-#ifdef CONFIG_PCI_MSI
-				if (!platform_legacy_irq(irq))
-					irq = IO_APIC_VECTOR(irq);
-#endif
 				printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
 					pci_name(dev), 'A' + pin, irq);
 				dev->irq = irq;
@@ -1179,33 +1171,3 @@ static int pirq_enable_irq(struct pci_de
 	}
 	return 0;
 }
-
-int pci_vector_resources(int last, int nr_released)
-{
-	int count = nr_released;
-
-	int next = last;
-	int offset = (last % 8);
-
-	while (next < FIRST_SYSTEM_VECTOR) {
-		next += 8;
-#ifdef CONFIG_X86_64
-		if (next == IA32_SYSCALL_VECTOR)
-			continue;
-#else
-		if (next == SYSCALL_VECTOR)
-			continue;
-#endif
-		count++;
-		if (next >= FIRST_SYSTEM_VECTOR) {
-			if (offset%8) {
-				next = FIRST_DEVICE_VECTOR + offset;
-				offset++;
-				continue;
-			}
-			count--;
-		}
-	}
-
-	return count;
-}
Index: head-2008-07-15/arch/x86/ia32/ia32entry-xen.S
===================================================================
--- head-2008-07-15.orig/arch/x86/ia32/ia32entry-xen.S	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/ia32/ia32entry-xen.S	2008-07-15 14:24:17.000000000 +0200
@@ -83,6 +83,7 @@
  */ 	
 ENTRY(ia32_sysenter_target)
 	CFI_STARTPROC32	simple
+	CFI_SIGNAL_FRAME
 	CFI_DEF_CFA	rsp,SS+8-RIP+16
 	/*CFI_REL_OFFSET	ss,SS-RIP+16*/
 	CFI_REL_OFFSET	rsp,RSP-RIP+16
@@ -164,6 +165,7 @@ ENDPROC(ia32_sysenter_target)
  */ 	
 ENTRY(ia32_cstar_target)
 	CFI_STARTPROC32	simple
+	CFI_SIGNAL_FRAME
 	CFI_DEF_CFA	rsp,SS+8-RIP+16
 	/*CFI_REL_OFFSET	ss,SS-RIP+16*/
 	CFI_REL_OFFSET	rsp,RSP-RIP+16
@@ -243,6 +245,7 @@ ia32_badarg:
 
 ENTRY(ia32_syscall)
 	CFI_STARTPROC	simple
+	CFI_SIGNAL_FRAME
 	CFI_DEF_CFA	rsp,SS+8-RIP+16
 	/*CFI_REL_OFFSET	ss,SS-RIP+16*/
 	CFI_REL_OFFSET	rsp,RSP-RIP+16
@@ -320,6 +323,7 @@ ENTRY(ia32_ptregs_common)
 	popq %r11
 	CFI_ENDPROC
 	CFI_STARTPROC32	simple
+	CFI_SIGNAL_FRAME
 	CFI_DEF_CFA	rsp,SS+8-ARGOFFSET
 	CFI_REL_OFFSET	rax,RAX-ARGOFFSET
 	CFI_REL_OFFSET	rcx,RCX-ARGOFFSET
@@ -653,8 +657,8 @@ ia32_sys_call_table:
 	.quad sys_readlinkat		/* 305 */
 	.quad sys_fchmodat
 	.quad sys_faccessat
-	.quad quiet_ni_syscall		/* pselect6 for now */
-	.quad quiet_ni_syscall		/* ppoll for now */
+	.quad compat_sys_pselect6
+	.quad compat_sys_ppoll
 	.quad sys_unshare		/* 310 */
 	.quad compat_sys_set_robust_list
 	.quad compat_sys_get_robust_list
@@ -663,4 +667,5 @@ ia32_sys_call_table:
 	.quad sys_tee
 	.quad compat_sys_vmsplice
 	.quad compat_sys_move_pages
+	.quad sys_getcpu
 ia32_syscall_end:		
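
The table above replaces the quiet_ni_syscall placeholders for pselect6/ppoll
with real compat entries and appends sys_getcpu. A hedged 32-bit userspace
sketch of the new call, assuming __NR_getcpu is 318 in this release's i386 ABI:

	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		unsigned cpu, node;

		/* args: &cpu, &node, getcpu_cache (may be NULL) */
		if (syscall(318, &cpu, &node, NULL) == 0)
			printf("cpu %u, node %u\n", cpu, node);
		return 0;
	}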
Index: head-2008-07-15/arch/x86/kernel/Makefile
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/Makefile	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/Makefile	2008-07-15 14:24:17.000000000 +0200
@@ -96,7 +96,7 @@ obj-$(CONFIG_X86_XEN)		+= fixup.o
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
-        obj-y				+= genapic_64.o genapic_flat_64.o genx2apic_uv_x.o
+        obj-$(CONFIG_X86_LOCAL_APIC)	+= genapic_64.o genapic_flat_64.o genx2apic_uv_x.o
         obj-$(CONFIG_X86_XEN_GENAPIC)	+= genapic_64.o genapic_xen_64.o
         obj-$(CONFIG_X86_PM_TIMER)	+= pmtimer_64.o
         obj-$(CONFIG_AUDIT)		+= audit_64.o
@@ -111,5 +111,6 @@ ifeq ($(CONFIG_X86_64),y)
 	pci-dma_64-$(CONFIG_XEN)	+= pci-dma_32.o
 endif
 
-disabled-obj-$(CONFIG_XEN) := i8253.o i8259_$(BITS).o reboot.o smpboot_$(BITS).o tsc_$(BITS).o
+disabled-obj-$(CONFIG_XEN) := early-quirks.o i8253.o i8259_$(BITS).o reboot.o \
+	smpboot_$(BITS).o tsc_$(BITS).o
 %/head_$(BITS).o %/head_$(BITS).s: $(if $(CONFIG_XEN),EXTRA_AFLAGS,dummy) :=
Index: head-2008-07-15/arch/x86/kernel/apic_64-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/apic_64-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/apic_64-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -43,7 +43,7 @@ int apic_verbosity;
  */
 void ack_bad_irq(unsigned int irq)
 {
-	printk("unexpected IRQ trap at vector %02x\n", irq);
+	printk("unexpected IRQ trap at irq %02x\n", irq);
 	/*
 	 * Currently unexpected vectors happen only on SMP and APIC.
 	 * We _must_ ack these because every local APIC has only N
@@ -62,19 +62,19 @@ int setup_profiling_timer(unsigned int m
 	return -EINVAL;
 }
 
-void smp_local_timer_interrupt(struct pt_regs *regs)
+void smp_local_timer_interrupt(void)
 {
-	profile_tick(CPU_PROFILING, regs);
+	profile_tick(CPU_PROFILING);
 #ifndef CONFIG_XEN
 #ifdef CONFIG_SMP
-		update_process_times(user_mode(regs));
+	update_process_times(user_mode(get_irq_regs()));
 #endif
 #endif
 	/*
 	 * We take the 'long' return path, and there every subsystem
 	 * grabs the appropriate locks (kernel lock/ irq lock).
 	 *
-	 * we might want to decouple profiling from the 'long path',
+	 * We might want to decouple profiling from the 'long path',
 	 * and do the profiling totally in assembly.
 	 *
 	 * Currently this isn't too much of an issue (performance wise),
@@ -92,6 +92,8 @@ void smp_local_timer_interrupt(struct pt
  */
 void smp_apic_timer_interrupt(struct pt_regs *regs)
 {
+	struct pt_regs *old_regs = set_irq_regs(regs);
+
 	/*
 	 * the NMI deadlock-detector uses this.
 	 */
@@ -109,8 +111,9 @@ void smp_apic_timer_interrupt(struct pt_
 	 */
 	exit_idle();
 	irq_enter();
-	smp_local_timer_interrupt(regs);
+	smp_local_timer_interrupt();
 	irq_exit();
+	set_irq_regs(old_regs);
 }
 
 /*
@@ -188,9 +191,8 @@ int disable_apic;
 int __init APIC_init_uniprocessor (void)
 {
 #ifdef CONFIG_X86_IO_APIC
-	if (smp_found_config)
-		if (!skip_ioapic_setup && nr_ioapics)
-			setup_IO_APIC();
+	if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
+		setup_IO_APIC();
 #endif
 
 	return 1;
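
The regs plumbing above follows the tree-wide 2.6.19 pattern: the interrupt
entry point stashes the register snapshot per-CPU, and inner code retrieves it
with get_irq_regs() instead of taking a pt_regs argument. A minimal sketch of
the idiom as smp_apic_timer_interrupt() now applies it:

	#include <linux/hardirq.h>
	#include <asm/irq_regs.h>

	void example_interrupt(struct pt_regs *regs)
	{
		struct pt_regs *old_regs = set_irq_regs(regs);

		irq_enter();
		/* handlers called from here may use get_irq_regs() */
		irq_exit();
		set_irq_regs(old_regs);	/* restore for nested interrupts */
	}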
Index: head-2008-07-15/arch/x86/kernel/e820_64-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/e820_64-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/e820_64-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -16,6 +16,7 @@
 #include <linux/string.h>
 #include <linux/kexec.h>
 #include <linux/module.h>
+#include <linux/mm.h>
 
 #include <asm/pgtable.h>
 #include <asm/page.h>
@@ -25,6 +26,11 @@
 #include <asm/sections.h>
 #include <xen/interface/memory.h>
 
+struct e820map e820 __initdata;
+#ifdef CONFIG_XEN
+struct e820map machine_e820 __initdata;
+#endif
+
 /* 
  * PFN of last memory page.
  */
@@ -41,14 +47,10 @@ unsigned long end_pfn_map; 
 /* 
  * Last pfn which the user wants to use.
  */
-unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT;  
+static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
 
 extern struct resource code_resource, data_resource;
 
-#ifdef CONFIG_XEN
-extern struct e820map machine_e820;
-#endif
-
 /* Check for some hardcoded bad areas that early boot is not allowed to touch */ 
 static inline int bad_addr(unsigned long *addrp, unsigned long size)
 { 
@@ -57,13 +59,13 @@ static inline int bad_addr(unsigned long
 #ifndef CONFIG_XEN
 	/* various gunk below that needed for SMP startup */
 	if (addr < 0x8000) { 
-		*addrp = 0x8000;
+		*addrp = PAGE_ALIGN(0x8000);
 		return 1; 
 	}
 
 	/* direct mapping tables of the kernel */
 	if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) { 
-		*addrp = table_end << PAGE_SHIFT; 
+		*addrp = PAGE_ALIGN(table_end << PAGE_SHIFT);
 		return 1;
 	} 
 
@@ -71,23 +73,18 @@ static inline int bad_addr(unsigned long
 #ifdef CONFIG_BLK_DEV_INITRD
 	if (LOADER_TYPE && INITRD_START && last >= INITRD_START && 
 	    addr < INITRD_START+INITRD_SIZE) { 
-		*addrp = INITRD_START + INITRD_SIZE; 
+		*addrp = PAGE_ALIGN(INITRD_START + INITRD_SIZE);
 		return 1;
 	} 
 #endif
-	/* kernel code + 640k memory hole (later should not be needed, but 
-	   be paranoid for now) */
-	if (last >= 640*1024 && addr < 1024*1024) {
-		*addrp = 1024*1024;
-		return 1;
-	}
-	if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) {
-		*addrp = __pa_symbol(&_end);
+	/* kernel code */
+	if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) {
+		*addrp = PAGE_ALIGN(__pa_symbol(&_end));
 		return 1;
 	}
 
 	if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
-		*addrp = ebda_addr + ebda_size;
+		*addrp = PAGE_ALIGN(ebda_addr + ebda_size);
 		return 1;
 	}
 
@@ -184,7 +181,7 @@ unsigned long __init find_e820_area(unsi
 			continue; 
 		while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
 			;
-		last = addr + size;
+		last = PAGE_ALIGN(addr) + size;
 		if (last > ei->addr + ei->size)
 			continue;
 		if (last > end) 
@@ -194,59 +191,14 @@ unsigned long __init find_e820_area(unsi
 	return -1UL;		
 } 
 
-/* 
- * Free bootmem based on the e820 table for a node.
- */
-void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
-{
-	int i;
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i]; 
-		unsigned long last, addr;
-
-		if (ei->type != E820_RAM || 
-		    ei->addr+ei->size <= start || 
-		    ei->addr >= end)
-			continue;
-
-		addr = round_up(ei->addr, PAGE_SIZE);
-		if (addr < start) 
-			addr = start;
-
-		last = round_down(ei->addr + ei->size, PAGE_SIZE); 
-		if (last >= end)
-			last = end; 
-
-		if (last > addr && last-addr >= PAGE_SIZE)
-			free_bootmem_node(pgdat, addr, last-addr);
-	}
-}
-
 /*
  * Find the highest page frame number we have available
  */
 unsigned long __init e820_end_of_ram(void)
 {
-	int i;
 	unsigned long end_pfn = 0;
+	end_pfn = find_max_pfn_with_active_regions();
 	
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i]; 
-		unsigned long start, end;
-
-		start = round_up(ei->addr, PAGE_SIZE); 
-		end = round_down(ei->addr + ei->size, PAGE_SIZE); 
-		if (start >= end)
-			continue;
-		if (ei->type == E820_RAM) { 
-		if (end > end_pfn<<PAGE_SHIFT)
-			end_pfn = end>>PAGE_SHIFT;
-		} else { 
-			if (end > end_pfn_map<<PAGE_SHIFT) 
-				end_pfn_map = end>>PAGE_SHIFT;
-		} 
-	}
-
 	if (end_pfn > end_pfn_map) 
 		end_pfn_map = end_pfn;
 	if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
@@ -256,43 +208,10 @@ unsigned long __init e820_end_of_ram(voi
 	if (end_pfn > end_pfn_map) 
 		end_pfn = end_pfn_map; 
 
+	printk("end_pfn_map = %lu\n", end_pfn_map);
 	return end_pfn;	
 }
 
-/* 
- * Compute how much memory is missing in a range.
- * Unlike the other functions in this file the arguments are in page numbers.
- */
-unsigned long __init
-e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
-{
-	unsigned long ram = 0;
-	unsigned long start = start_pfn << PAGE_SHIFT;
-	unsigned long end = end_pfn << PAGE_SHIFT;
-	int i;
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		unsigned long last, addr;
-
-		if (ei->type != E820_RAM ||
-		    ei->addr+ei->size <= start ||
-		    ei->addr >= end)
-			continue;
-
-		addr = round_up(ei->addr, PAGE_SIZE);
-		if (addr < start)
-			addr = start;
-
-		last = round_down(ei->addr + ei->size, PAGE_SIZE);
-		if (last >= end)
-			last = end;
-
-		if (last > addr)
-			ram += last - addr;
-	}
-	return ((end - start) - ram) >> PAGE_SHIFT;
-}
-
 /*
  * Mark e820 reserved areas as busy for the resource manager.
  */
@@ -333,6 +252,98 @@ void __init e820_reserve_resources(struc
 	}
 }
 
+#ifndef CONFIG_XEN
+/* Mark pages corresponding to given address range as nosave */
+static void __init
+e820_mark_nosave_range(unsigned long start, unsigned long end)
+{
+	unsigned long pfn, max_pfn;
+
+	if (start >= end)
+		return;
+
+	printk("Nosave address range: %016lx - %016lx\n", start, end);
+	max_pfn = end >> PAGE_SHIFT;
+	for (pfn = start >> PAGE_SHIFT; pfn < max_pfn; pfn++)
+		if (pfn_valid(pfn))
+			SetPageNosave(pfn_to_page(pfn));
+}
+
+/*
+ * Find the ranges of physical addresses that do not correspond to
+ * e820 RAM areas and mark the corresponding pages as nosave for software
+ * suspend and suspend to RAM.
+ *
+ * This function requires the e820 map to be sorted and without any
+ * overlapping entries and assumes the first e820 area to be RAM.
+ */
+void __init e820_mark_nosave_regions(void)
+{
+	int i;
+	unsigned long paddr;
+
+	paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
+	for (i = 1; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+
+		if (paddr < ei->addr)
+			e820_mark_nosave_range(paddr,
+					round_up(ei->addr, PAGE_SIZE));
+
+		paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
+		if (ei->type != E820_RAM)
+			e820_mark_nosave_range(round_up(ei->addr, PAGE_SIZE),
+					paddr);
+
+		if (paddr >= (end_pfn << PAGE_SHIFT))
+			break;
+	}
+}
+#endif
+
+/* Walk the e820 map and register active regions within a node */
+void __init
+e820_register_active_regions(int nid, unsigned long start_pfn,
+							unsigned long end_pfn)
+{
+	int i;
+	unsigned long ei_startpfn, ei_endpfn;
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+		ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
+		ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE)
+								>> PAGE_SHIFT;
+
+		/* Skip map entries smaller than a page */
+		if (ei_startpfn >= ei_endpfn)
+			continue;
+
+		/* Check if end_pfn_map should be updated */
+		if (ei->type != E820_RAM && ei_endpfn > end_pfn_map)
+			end_pfn_map = ei_endpfn;
+
+		/* Skip if map is outside the node */
+		if (ei->type != E820_RAM ||
+				ei_endpfn <= start_pfn ||
+				ei_startpfn >= end_pfn)
+			continue;
+
+		/* Check for overlaps */
+		if (ei_startpfn < start_pfn)
+			ei_startpfn = start_pfn;
+		if (ei_endpfn > end_pfn)
+			ei_endpfn = end_pfn;
+
+		/* Obey end_user_pfn to save on memmap */
+		if (ei_startpfn >= end_user_pfn)
+			continue;
+		if (ei_endpfn > end_user_pfn)
+			ei_endpfn = end_user_pfn;
+
+		add_active_range(nid, ei_startpfn, ei_endpfn);
+	}
+}
+
 /* 
  * Add a memory region to the kernel e820 map.
  */ 
@@ -553,13 +564,6 @@ static int __init sanitize_e820_map(stru
  * If we're lucky and live on a modern system, the setup code
  * will have given us a memory map that we can use to properly
  * set up memory.  If we aren't, we'll fake a memory map.
- *
- * We check to see that the memory map contains at least 2 elements
- * before we'll use it, because the detection code in setup.S may
- * not be perfect and most every PC known to man has two memory
- * regions: one from 0 to 640k, and one from 1mb up.  (The IBM
- * thinkpad 560x, for example, does not cooperate with the memory
- * detection code.)
  */
 static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
 {
@@ -581,27 +585,6 @@ static int __init copy_e820_map(struct e
 		if (start > end)
 			return -1;
 
-#ifndef CONFIG_XEN
-		/*
-		 * Some BIOSes claim RAM in the 640k - 1M region.
-		 * Not right. Fix it up.
-		 * 
-		 * This should be removed on Hammer which is supposed to not
-		 * have non e820 covered ISA mappings there, but I had some strange
-		 * problems so it stays for now.  -AK
-		 */
-		if (type == E820_RAM) {
-			if (start < 0x100000ULL && end > 0xA0000ULL) {
-				if (start < 0xA0000ULL)
-					add_memory_region(start, 0xA0000ULL-start, type);
-				if (end <= 0x100000ULL)
-					continue;
-				start = 0x100000ULL;
-				size = end - start;
-			}
-		}
-#endif
-
 		add_memory_region(start, size, type);
 	} while (biosmap++,--nr_map);
 
@@ -622,11 +605,15 @@ static int __init copy_e820_map(struct e
 	return 0;
 }
 
+void early_panic(char *msg)
+{
+	early_printk(msg);
+	panic(msg);
+}
+
 #ifndef CONFIG_XEN
 void __init setup_memory_region(void)
 {
-	char *who = "BIOS-e820";
-
 	/*
 	 * Try to copy the BIOS-supplied E820-map.
 	 *
@@ -634,24 +621,10 @@ void __init setup_memory_region(void)
 	 * the next section from 1mb->appropriate_mem_k
 	 */
 	sanitize_e820_map(E820_MAP, &E820_MAP_NR);
-	if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
-		unsigned long mem_size;
-
-		/* compare results from other methods and take the greater */
-		if (ALT_MEM_K < EXT_MEM_K) {
-			mem_size = EXT_MEM_K;
-			who = "BIOS-88";
-		} else {
-			mem_size = ALT_MEM_K;
-			who = "BIOS-e801";
-		}
-
-		e820.nr_map = 0;
-		add_memory_region(0, LOWMEMSIZE(), E820_RAM);
-		add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
-  	}
+	if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0)
+		early_panic("Cannot find a valid memory map");
 	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
-	e820_print_map(who);
+	e820_print_map("BIOS-e820");
 }
 
 #else  /* CONFIG_XEN */
@@ -683,20 +656,23 @@ void __init setup_memory_region(void)
 
 	sanitize_e820_map(map, (char *)&memmap.nr_entries);
 
-	BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
+	if (copy_e820_map(map, (char)memmap.nr_entries) < 0)
+		early_panic("Cannot find a valid memory map");
 
 	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
 	e820_print_map("Xen");
 }
 #endif
 
-void __init parse_memopt(char *p, char **from) 
-{ 
+static int __init parse_memopt(char *p)
+{
 	int i;
 	unsigned long current_end;
 	unsigned long end;
 
-	end_user_pfn = memparse(p, from);
+	if (!p)
+		return -EINVAL;
+	end_user_pfn = memparse(p, &p);
 	end_user_pfn >>= PAGE_SHIFT;	
 
 	end = end_user_pfn<<PAGE_SHIFT;
@@ -713,27 +689,61 @@ void __init parse_memopt(char *p, char *
 		else
 			add_memory_region(current_end, end - current_end, E820_RAM);
 	}
+
+	return 0;
 } 
+early_param("mem", parse_memopt);
+
+static int userdef __initdata;
 
-void __init parse_memmapopt(char *p, char **from)
+static int __init parse_memmap_opt(char *p)
 {
+	char *oldp;
 	unsigned long long start_at, mem_size;
 
-	mem_size = memparse(p, from);
-	p = *from;
+	if (!strcmp(p, "exactmap")) {
+#ifdef CONFIG_CRASH_DUMP
+		/* If we are doing a crash dump, we
+		 * still need to know the real memory
+		 * size before the original memory map
+		 * is reset.
+		 */
+		e820_register_active_regions(0, 0, -1UL);
+		saved_max_pfn = e820_end_of_ram();
+		remove_all_active_ranges();
+#endif
+		end_pfn_map = 0;
+		e820.nr_map = 0;
+		userdef = 1;
+		return 0;
+	}
+
+	oldp = p;
+	mem_size = memparse(p, &p);
+	if (p == oldp)
+		return -EINVAL;
 	if (*p == '@') {
-		start_at = memparse(p+1, from);
+		start_at = memparse(p+1, &p);
 		add_memory_region(start_at, mem_size, E820_RAM);
 	} else if (*p == '#') {
-		start_at = memparse(p+1, from);
+		start_at = memparse(p+1, &p);
 		add_memory_region(start_at, mem_size, E820_ACPI);
 	} else if (*p == '$') {
-		start_at = memparse(p+1, from);
+		start_at = memparse(p+1, &p);
 		add_memory_region(start_at, mem_size, E820_RESERVED);
 	} else {
 		end_user_pfn = (mem_size >> PAGE_SHIFT);
 	}
-	p = *from;
+	return *p == '\0' ? 0 : -EINVAL;
+}
+early_param("memmap", parse_memmap_opt);
+
+void finish_e820_parsing(void)
+{
+	if (userdef) {
+		printk(KERN_INFO "user-defined physical RAM map:\n");
+		e820_print_map("user");
+	}
 }
 
 unsigned long pci_mem_start = 0xaeedbabe;
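
parse_memopt() and parse_memmap_opt() above move from hand parsing of the raw
command line to early_param(), which passes each handler its isolated argument
during early boot; parse_memmap_opt() understands size@addr (RAM), size#addr
(ACPI), size$addr (reserved) and the "exactmap" keyword. A minimal sketch of
the registration pattern, with a hypothetical option name:

	#include <linux/init.h>

	static int __init parse_example(char *arg)
	{
		if (!arg)
			return -EINVAL;	/* "example=" given with no value */
		/* parse arg here */
		return 0;		/* 0 = handled, negative = error */
	}
	early_param("example", parse_example);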
Index: head-2008-07-15/arch/x86/kernel/early_printk-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/early_printk-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/early_printk-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -244,20 +244,16 @@ void early_printk(const char *fmt, ...)
 
 static int __initdata keep_early;
 
-int __init setup_early_printk(char *opt)
+static int __init setup_early_printk(char *buf)
 {
-	char *space;
-	char buf[256];
+	if (!buf)
+		return 0;
 
 	if (early_console_initialized)
-		return 1;
-
-	strlcpy(buf,opt,sizeof(buf));
-	space = strchr(buf, ' ');
-	if (space)
-		*space = 0;
+		return 0;
+	early_console_initialized = 1;
 
-	if (strstr(buf,"keep"))
+	if (strstr(buf, "keep"))
 		keep_early = 1;
 
 	if (!strncmp(buf, "serial", 6)) {
@@ -281,11 +277,12 @@ int __init setup_early_printk(char *opt)
  		early_console = &simnow_console;
  		keep_early = 1;
 	}
-	early_console_initialized = 1;
 	register_console(early_console);
 	return 0;
 }
 
+early_param("earlyprintk", setup_early_printk);
+
 void __init disable_early_printk(void)
 {
 	if (!early_console_initialized || !early_console)
@@ -299,4 +296,3 @@ void __init disable_early_printk(void)
 	}
 }
 
-__setup("earlyprintk=", setup_early_printk);
Index: head-2008-07-15/arch/x86/kernel/entry_64-xen.S
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/entry_64-xen.S	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/entry_64-xen.S	2008-07-15 14:24:17.000000000 +0200
@@ -4,9 +4,6 @@
  *  Copyright (C) 1991, 1992  Linus Torvalds
  *  Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
  *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
- * 
- *  $Id$
- *
  *  Jun Nakajima <jun.nakajima@intel.com>
  *  Asit Mallick <asit.k.mallick@intel.com>
  *      Modified for Xen
@@ -26,15 +23,25 @@
  * at the top of the kernel process stack.	
  * - partial stack frame: partially saved registers upto R11.
  * - full stack frame: Like partial stack frame, but all register saved. 
- *	
- * TODO:	 
- * - schedule it carefully for the final hardware.
+ *
+ * Some macro usage:
+ * - CFI macros are used to generate dwarf2 unwind information for better
+ * backtraces. They don't change any code.
+ * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
+ * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
+ * There are unfortunately lots of special cases where some registers
+ * are not touched. The macro is a big mess that should be cleaned up.
+ * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
+ * Gives a full stack frame.
+ * - ENTRY/END - Define functions in the symbol table.
+ * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
+ * frame that is otherwise undefined after a SYSCALL.
+ * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
+ * - errorentry/paranoidentry/zeroentry - Define exception entry points.
  */
 
-#define ASSEMBLY 1
 #include <linux/linkage.h>
 #include <asm/segment.h>
-#include <asm/smp.h>
 #include <asm/cache.h>
 #include <asm/errno.h>
 #include <asm/dwarf2.h>
@@ -117,6 +124,7 @@ NMI_MASK = 0x80000000
 	.macro	CFI_DEFAULT_STACK start=1,adj=0
 	.if \start
 	CFI_STARTPROC	simple
+	CFI_SIGNAL_FRAME
 	CFI_DEF_CFA	rsp,SS+8-(\adj*ARGOFFSET)
 	.else
 	CFI_DEF_CFA_OFFSET SS+8-(\adj*ARGOFFSET)
@@ -207,6 +215,7 @@ END(ret_from_fork)
  */
 	.macro _frame ref
 	CFI_STARTPROC simple
+	CFI_SIGNAL_FRAME
 	CFI_DEF_CFA rsp,SS+8-\ref
 	/*CFI_REL_OFFSET ss,SS-\ref*/
 	CFI_REL_OFFSET rsp,RSP-\ref
@@ -334,6 +343,8 @@ tracesys:			 
 	LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
 	RESTORE_REST
 	cmpq $__NR_syscall_max,%rax
+	movq $-ENOSYS,%rcx
+	cmova %rcx,%rax
 	ja  1f
 	movq %r10,%rcx	/* fixup for C */
 	call *sys_call_table(,%rax,8)
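
The movq/cmova pair added above gives out-of-range syscall numbers a
branch-free default: %rax already holds -ENOSYS by the time the ja is taken,
so the return path needs no separate fixup. In C terms, roughly:

	if ((unsigned long)nr > __NR_syscall_max)
		ret = -ENOSYS;			/* cmova %rcx,%rax */
	else
		ret = sys_call_table[nr](args);	/* call *sys_call_table(,%rax,8) */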
@@ -349,6 +360,7 @@ END(system_call)
  */ 	
 ENTRY(int_ret_from_sys_call)
 	CFI_STARTPROC	simple
+	CFI_SIGNAL_FRAME
 	CFI_DEF_CFA	rsp,SS+8-ARGOFFSET
 	/*CFI_REL_OFFSET	ss,SS-ARGOFFSET*/
 	CFI_REL_OFFSET	rsp,RSP-ARGOFFSET
@@ -583,8 +595,7 @@ retint_signal:
 #ifdef CONFIG_PREEMPT
 	/* Returning to kernel space. Check if we need preemption */
 	/* rcx:	 threadinfo. interrupts off. */
-	.p2align
-retint_kernel:	
+ENTRY(retint_kernel)
 	cmpl $0,threadinfo_preempt_count(%rcx)
 	jnz  retint_restore_args
 	bt  $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
@@ -644,7 +655,6 @@ ENTRY(call_function_interrupt)
 END(call_function_interrupt)
 #endif
 
-#ifdef CONFIG_X86_LOCAL_APIC	
 ENTRY(apic_timer_interrupt)
 	apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
 END(apic_timer_interrupt)
@@ -656,7 +666,6 @@ END(error_interrupt)
 ENTRY(spurious_interrupt)
 	apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
 END(spurious_interrupt)
-#endif
 #endif /* !CONFIG_XEN */
 				
 /*
@@ -755,7 +764,9 @@ paranoid_exit\trace:
 	testl $3,CS(%rsp)
 	jnz   paranoid_userspace\trace
 paranoid_swapgs\trace:
+	.if \trace
 	TRACE_IRQS_IRETQ 0
+	.endif
 	swapgs
 paranoid_restore\trace:
 	RESTORE_ALL 8
@@ -802,7 +813,7 @@ paranoid_schedule\trace:
  * Exception entry point. This expects an error code/orig_rax on the stack
  * and the exception handler in %rax.	
  */ 		  				
-ENTRY(error_entry)
+KPROBE_ENTRY(error_entry)
 	_frame RDI
 	CFI_REL_OFFSET rax,0
 	/* rdi slot contains rax, oldrax contains error code */
@@ -896,7 +907,7 @@ error_kernelspace:
 	jmp  error_sti
 #endif
 	CFI_ENDPROC
-END(error_entry)
+KPROBE_END(error_entry)
 	
 ENTRY(hypervisor_callback)
 	zeroentry do_hypervisor_callback
@@ -936,26 +947,6 @@ ENTRY(do_hypervisor_callback)   # do_hyp
 	CFI_ENDPROC
 END(do_hypervisor_callback)
 
-#ifdef CONFIG_X86_LOCAL_APIC
-KPROBE_ENTRY(nmi)
-	zeroentry do_nmi_callback
-ENTRY(do_nmi_callback)
-	CFI_STARTPROC
-        addq $8, %rsp
-	CFI_ENDPROC
-	CFI_DEFAULT_STACK
-        call do_nmi
-        orl  $NMI_MASK,EFLAGS(%rsp)
-        RESTORE_REST
-        XEN_BLOCK_EVENTS(%rsi)
-	TRACE_IRQS_OFF
-        GET_THREAD_INFO(%rcx)
-        jmp  retint_restore_args
-	CFI_ENDPROC
-	.previous .text
-END(nmi)
-#endif
-
         ALIGN
 restore_all_enable_events:  
 	CFI_DEFAULT_STACK adj=1
@@ -1121,7 +1112,7 @@ ENDPROC(child_rip)
  * do_sys_execve asm fallback arguments:
  *	rdi: name, rsi: argv, rdx: envp, fake frame on the stack
  */
-ENTRY(execve)
+ENTRY(kernel_execve)
 	CFI_STARTPROC
 	FAKE_STACK_FRAME $0
 	SAVE_ALL	
@@ -1135,12 +1126,11 @@ ENTRY(execve)
 	UNFAKE_STACK_FRAME
 	ret
 	CFI_ENDPROC
-ENDPROC(execve)
+ENDPROC(kernel_execve)
 
 KPROBE_ENTRY(page_fault)
 	errorentry do_page_fault
-END(page_fault)
-	.previous .text
+KPROBE_END(page_fault)
 
 ENTRY(coprocessor_error)
 	zeroentry do_coprocessor_error
@@ -1162,25 +1152,25 @@ KPROBE_ENTRY(debug)
 	zeroentry do_debug
 /*	paranoidexit
 	CFI_ENDPROC */
-END(debug)
-	.previous .text
+KPROBE_END(debug)
 
-#if 0
-	/* runs on exception stack */	
 KPROBE_ENTRY(nmi)
-	INTR_FRAME
-	pushq $-1
-	CFI_ADJUST_CFA_OFFSET 8
-	paranoidentry do_nmi, 0, 0
-#ifdef CONFIG_TRACE_IRQFLAGS
-	paranoidexit 0
-#else
-	jmp paranoid_exit1
- 	CFI_ENDPROC
-#endif
-END(nmi)
-	.previous .text
-#endif        
+	zeroentry do_nmi_callback
+KPROBE_END(nmi)
+do_nmi_callback:
+	CFI_STARTPROC
+	addq $8, %rsp
+	CFI_ENDPROC
+	CFI_DEFAULT_STACK
+	call do_nmi
+	orl  $NMI_MASK,EFLAGS(%rsp)
+	RESTORE_REST
+	XEN_BLOCK_EVENTS(%rsi)
+	TRACE_IRQS_OFF
+	GET_THREAD_INFO(%rcx)
+	jmp  retint_restore_args
+	CFI_ENDPROC
+END(do_nmi_callback)
 
 KPROBE_ENTRY(int3)
 /* 	INTR_FRAME
@@ -1189,8 +1179,7 @@ KPROBE_ENTRY(int3)
  	zeroentry do_int3
 /* 	jmp paranoid_exit1
  	CFI_ENDPROC */
-END(int3)
-	.previous .text
+KPROBE_END(int3)
 
 ENTRY(overflow)
 	zeroentry do_overflow
@@ -1241,8 +1230,7 @@ END(stack_segment)
 
 KPROBE_ENTRY(general_protection)
 	errorentry do_general_protection
-END(general_protection)
-	.previous .text
+KPROBE_END(general_protection)
 
 ENTRY(alignment_check)
 	errorentry do_alignment_check
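
The recurring "END(x); .previous .text" -> KPROBE_END(x) substitution above
relies on the linkage helpers this kernel release introduced. A reconstruction
of their likely shape — treat the exact directives as an assumption, not a
quotation of <linux/linkage.h>:

	#define KPROBE_ENTRY(name) \
		.pushsection .kprobes.text, "ax"; \
		ENTRY(name)

	#define KPROBE_END(name) \
		END(name); \
		.popsection

Bundling the section push/pop with ENTRY/END keeps each kprobe-sensitive entry
point in .kprobes.text without per-site section bookkeeping.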
Index: head-2008-07-15/arch/x86/kernel/genapic_xen_64.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/genapic_xen_64.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/genapic_xen_64.c	2008-07-15 14:24:17.000000000 +0200
@@ -71,6 +71,13 @@ static cpumask_t xen_target_cpus(void)
 	return cpu_online_map;
 }
 
+static cpumask_t xen_vector_allocation_domain(int cpu)
+{
+	cpumask_t domain = CPU_MASK_NONE;
+	cpu_set(cpu, domain);
+	return domain;
+}
+
 /*
  * Set up the logical destination ID.
  * Do nothing, not called now.
@@ -147,8 +154,8 @@ struct genapic apic_xen =  {
 	.int_delivery_mode = dest_LowestPrio,
 #endif
 	.int_dest_mode = (APIC_DEST_LOGICAL != 0),
-	.int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST,
 	.target_cpus = xen_target_cpus,
+	.vector_allocation_domain = xen_vector_allocation_domain,
 #ifdef CONFIG_XEN_PRIVILEGED_GUEST
 	.apic_id_registered = xen_apic_id_registered,
 #endif
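
For contrast with xen_vector_allocation_domain() above: the native flat
genapic in this release claims each vector on all CPUs, since lowest-priority
delivery lets the hardware pick any CPU in the destination set. A paraphrase
of that counterpart (modelled, not quoted):

	static cpumask_t flat_vector_allocation_domain(int cpu)
	{
		cpumask_t domain = CPU_MASK_ALL;	/* one vector, every CPU */
		return domain;
	}

The Xen variant pins each vector to a single CPU because vectors come from the
hypervisor via PHYSDEVOP_alloc_irq_vector rather than from lowest-priority
hardware routing.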
Index: head-2008-07-15/arch/x86/kernel/head_64-xen.S
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/head_64-xen.S	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/head_64-xen.S	2008-07-15 14:24:17.000000000 +0200
@@ -5,9 +5,6 @@
  *  Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
  *  Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
  *  Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
- *
- *  $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $
- *
  *  Jun Nakajima <jun.nakajima@intel.com>
  *    Modified for Xen                                
  */
@@ -149,7 +146,7 @@ ENTRY(cpu_gdt_table)
 	.quad	0,0			/* TSS */
 	.quad	0,0			/* LDT */
 	.quad   0,0,0			/* three TLS descriptors */
-	.quad	0			/* unused */
+	.quad	0x0000f40000000000	/* node/CPU stored in limit */
 gdt_end:
 	/* asm/segment.h:GDT_ENTRIES must match this */
 	/* This should be a multiple of the cache line size */
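
The new descriptor 0x0000f40000000000 is the per-CPU GDT slot whose segment
*limit* the kernel later fills with the CPU and node numbers, so unprivileged
code can fetch them with lsl — the machinery behind this release's getcpu
vsyscall. A hedged read-side sketch; the selector value and the
cpu-in-bits-0-11/node-above split are assumptions about the era's layout:

	unsigned int p, cpu, node;

	/* 0x7b = GDT entry 15, RPL 3 -- assumed per-CPU selector */
	asm("lsl %1,%0" : "=r" (p) : "r" (0x7b));
	cpu  = p & 0xfff;
	node = p >> 12;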
Index: head-2008-07-15/arch/x86/kernel/head64-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/head64-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/head64-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -54,11 +54,9 @@ static void __init copy_bootdata(char *r
 	new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
 	if (!new_data) {
 		if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
-			printk("so old bootloader that it does not support commandline?!\n");
 			return;
 		}
 		new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
-		printk("old bootloader convention, maybe loadlin?\n");
 	}
 	command_line = (char *) ((u64)(new_data));
 	memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE);
@@ -70,25 +68,6 @@ static void __init copy_bootdata(char *r
 	memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
 	saved_command_line[max_cmdline-1] = '\0';
 #endif
-	printk("Bootdata ok (command line is %s)\n", saved_command_line);
-}
-
-static void __init setup_boot_cpu_data(void)
-{
-	unsigned int dummy, eax;
-
-	/* get vendor info */
-	cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level,
-	      (unsigned int *)&boot_cpu_data.x86_vendor_id[0],
-	      (unsigned int *)&boot_cpu_data.x86_vendor_id[8],
-	      (unsigned int *)&boot_cpu_data.x86_vendor_id[4]);
-
-	/* get cpu type */
-	cpuid(1, &eax, &dummy, &dummy,
-		(unsigned int *) &boot_cpu_data.x86_capability);
-	boot_cpu_data.x86 = (eax >> 8) & 0xf;
-	boot_cpu_data.x86_model = (eax >> 4) & 0xf;
-	boot_cpu_data.x86_mask = eax & 0xf;
 }
 
 #include <xen/interface/memory.h>
@@ -101,7 +80,6 @@ void __init x86_64_start_kernel(char * r
 {
 	struct xen_machphys_mapping mapping;
 	unsigned long machine_to_phys_nr_ents;
-	char *s;
 	int i;
 
 	setup_xen_features();
@@ -128,10 +106,7 @@ void __init x86_64_start_kernel(char * r
 	asm volatile("lidt %0" :: "m" (idt_descr));
 #endif
 
-	/*
-	 * This must be called really, really early:
-	 */
-	lockdep_init();
+	early_printk("Kernel alive\n");
 
  	for (i = 0; i < NR_CPUS; i++)
  		cpu_pda(i) = &boot_cpu_pda[i];
@@ -141,22 +116,5 @@ void __init x86_64_start_kernel(char * r
 #ifdef CONFIG_SMP
 	cpu_set(0, cpu_online_map);
 #endif
-	s = strstr(saved_command_line, "earlyprintk=");
-	if (s != NULL)
-		setup_early_printk(strchr(s, '=') + 1);
-#ifdef CONFIG_NUMA
-	s = strstr(saved_command_line, "numa=");
-	if (s != NULL)
-		numa_setup(s+5);
-#endif
-#ifdef CONFIG_X86_IO_APIC
-	if (strstr(saved_command_line, "disableapic"))
-		disable_apic = 1;
-#endif
-	/* You need early console to see that */
-	if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE)
-		panic("Kernel too big for kernel mapping\n");
-
-	setup_boot_cpu_data();
 	start_kernel();
 }
Index: head-2008-07-15/arch/x86/kernel/io_apic_64-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/io_apic_64-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/io_apic_64-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -26,9 +26,12 @@
 #include <linux/delay.h>
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
+#include <linux/pci.h>
 #include <linux/mc146818rtc.h>
 #include <linux/acpi.h>
 #include <linux/sysdev.h>
+#include <linux/msi.h>
+#include <linux/htirq.h>
 #ifdef CONFIG_ACPI
 #include <acpi/acpi_bus.h>
 #endif
@@ -41,6 +44,10 @@
 #include <asm/acpi.h>
 #include <asm/dma.h>
 #include <asm/nmi.h>
+#include <asm/msidef.h>
+#include <asm/hypertransport.h>
+
+static int assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result);
 
 #define __apicdebuginit  __init
 
@@ -48,17 +55,29 @@ int sis_apic_bug; /* not actually suppor
 
 static int no_timer_check;
 
-int disable_timer_pin_1 __initdata;
+static int disable_timer_pin_1 __initdata;
 
-#ifndef CONFIG_XEN
-int timer_over_8254 __initdata = 0;
+#ifdef CONFIG_XEN
+#include <xen/interface/xen.h>
+#include <xen/interface/physdev.h>
+
+/* Fake i8259 */
+#define make_8259A_irq(_irq)     (io_apic_irqs &= ~(1UL<<(_irq)))
+#define disable_8259A_irq(_irq)  ((void)0)
+#define i8259A_irq_pending(_irq) (0)
+
+unsigned long io_apic_irqs;
+
+#define clear_IO_APIC() ((void)0)
+#else
+int timer_over_8254 __initdata = 1;
 
 /* Where if anywhere is the i8259 connect in external int mode */
 static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
 #endif
 
 static DEFINE_SPINLOCK(ioapic_lock);
-static DEFINE_SPINLOCK(vector_lock);
+DEFINE_SPINLOCK(vector_lock);
 
 /*
  * # of IRQ routing registers
@@ -83,28 +102,27 @@ static struct irq_pin_list {
 	short apic, pin, next;
 } irq_2_pin[PIN_MAP_SIZE];
 
-int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
-#ifdef CONFIG_PCI_MSI
-#define vector_to_irq(vector) 	\
-	(platform_legacy_irq(vector) ? vector : vector_irq[vector])
-#else
-#define vector_to_irq(vector)	(vector)
-#endif
-
-#ifdef CONFIG_XEN
-
-#include <xen/interface/xen.h>
-#include <xen/interface/physdev.h>
-
-/* Fake i8259 */
-#define make_8259A_irq(_irq)     (io_apic_irqs &= ~(1UL<<(_irq)))
-#define disable_8259A_irq(_irq)  ((void)0)
-#define i8259A_irq_pending(_irq) (0)
+#ifndef CONFIG_XEN
+struct io_apic {
+	unsigned int index;
+	unsigned int unused[3];
+	unsigned int data;
+};
 
-unsigned long io_apic_irqs;
+static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
+{
+	return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
+		+ (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
+}
+#endif
 
-static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
+static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
 {
+#ifndef CONFIG_XEN
+	struct io_apic __iomem *io_apic = io_apic_base(apic);
+	writel(reg, &io_apic->index);
+	return readl(&io_apic->data);
+#else
 	struct physdev_apic apic_op;
 	int ret;
 
@@ -114,31 +132,131 @@ static inline unsigned int xen_io_apic_r
 	if (ret)
 		return ret;
 	return apic_op.value;
+#endif
 }
 
-static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
 {
+#ifndef CONFIG_XEN
+	struct io_apic __iomem *io_apic = io_apic_base(apic);
+	writel(reg, &io_apic->index);
+	writel(value, &io_apic->data);
+#else
 	struct physdev_apic apic_op;
 
 	apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
 	apic_op.reg = reg;
 	apic_op.value = value;
 	WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
+#endif
+}
+
+#ifndef CONFIG_XEN
+/*
+ * Re-write a value: to be used for read-modify-write
+ * cycles where the read already set up the index register.
+ */
+static inline void io_apic_modify(unsigned int apic, unsigned int value)
+{
+	struct io_apic __iomem *io_apic = io_apic_base(apic);
+	writel(value, &io_apic->data);
 }
+#else
+#define io_apic_modify io_apic_write
+#endif
 
-#define io_apic_read(a,r)    xen_io_apic_read(a,r)
-#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
+/*
+ * Synchronize the IO-APIC and the CPU by doing
+ * a dummy read from the IO-APIC
+ */
+static inline void io_apic_sync(unsigned int apic)
+{
+#ifndef CONFIG_XEN
+	struct io_apic __iomem *io_apic = io_apic_base(apic);
+	readl(&io_apic->data);
+#endif
+}
 
-#define clear_IO_APIC() ((void)0)
+union entry_union {
+	struct { u32 w1, w2; };
+	struct IO_APIC_route_entry entry;
+};
 
-#else
+static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
+{
+	union entry_union eu;
+	unsigned long flags;
+	spin_lock_irqsave(&ioapic_lock, flags);
+	eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
+	eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+	return eu.entry;
+}
+
+/*
+ * When we write a new IO APIC routing entry, we need to write the high
+ * word first! If the mask bit in the low word is clear, we will enable
+ * the interrupt, and we need to make sure the entry is fully populated
+ * before that happens.
+ */
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+	unsigned long flags;
+	union entry_union eu;
+	eu.entry = e;
+	spin_lock_irqsave(&ioapic_lock, flags);
+	io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+	io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+#ifndef CONFIG_XEN
+/*
+ * When we mask an IO APIC routing entry, we need to write the low
+ * word first, in order to set the mask bit before we change the
+ * high bits!
+ */
+static void ioapic_mask_entry(int apic, int pin)
+{
+	unsigned long flags;
+	union entry_union eu = { .entry.mask = 1 };
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+	io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
 
 #ifdef CONFIG_SMP
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
+{
+	int apic, pin;
+	struct irq_pin_list *entry = irq_2_pin + irq;
+
+	BUG_ON(irq >= NR_IRQS);
+	for (;;) {
+		unsigned int reg;
+		apic = entry->apic;
+		pin = entry->pin;
+		if (pin == -1)
+			break;
+		io_apic_write(apic, 0x11 + pin*2, dest);
+		reg = io_apic_read(apic, 0x10 + pin*2);
+		reg &= ~0x000000ff;
+		reg |= vector;
+		io_apic_modify(apic, reg);
+		if (!entry->next)
+			break;
+		entry = irq_2_pin + entry->next;
+	}
+}
+
 static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
 {
 	unsigned long flags;
 	unsigned int dest;
 	cpumask_t tmp;
+	int vector;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
@@ -146,7 +264,11 @@ static void set_ioapic_affinity_irq(unsi
 
 	cpus_and(mask, tmp, CPU_MASK_ALL);
 
-	dest = cpu_mask_to_apicid(mask);
+	vector = assign_irq_vector(irq, mask, &tmp);
+	if (vector < 0)
+		return;
+
+	dest = cpu_mask_to_apicid(tmp);
 
 	/*
 	 * Only the high 8 bits are valid.
@@ -154,13 +276,12 @@ static void set_ioapic_affinity_irq(unsi
 	dest = SET_APIC_LOGICAL_ID(dest);
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	__DO_ACTION(1, = dest, )
-	set_irq_info(irq, mask);
+	__target_IO_APIC_irq(irq, dest, vector);
+	set_native_irq_info(irq, mask);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 #endif
-
-#endif /* !CONFIG_XEN */
+#endif
 
 /*
  * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
@@ -240,24 +361,15 @@ static void unmask_IO_APIC_irq (unsigned
 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
 {
 	struct IO_APIC_route_entry entry;
-	unsigned long flags;
 
 	/* Check delivery_mode to be sure we're not clearing an SMI pin */
-	spin_lock_irqsave(&ioapic_lock, flags);
-	*(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
-	*(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
-	spin_unlock_irqrestore(&ioapic_lock, flags);
+	entry = ioapic_read_entry(apic, pin);
 	if (entry.delivery_mode == dest_SMI)
 		return;
 	/*
 	 * Disable it in the IO-APIC irq-routing table:
 	 */
-	memset(&entry, 0, sizeof(entry));
-	entry.mask = 1;
-	spin_lock_irqsave(&ioapic_lock, flags);
-	io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
-	io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
-	spin_unlock_irqrestore(&ioapic_lock, flags);
+	ioapic_mask_entry(apic, pin);
 }
 
 static void clear_IO_APIC (void)
@@ -271,16 +383,6 @@ static void clear_IO_APIC (void)
 
 #endif /* !CONFIG_XEN */
 
-static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF };
-
-/*
- * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
- * specific CPU-side IRQs.
- */
-
-#define MAX_PIRQS 8
-static int pirq_entries [MAX_PIRQS];
-static int pirqs_enabled;
 int skip_ioapic_setup;
 int ioapic_force;
 
@@ -289,18 +391,17 @@ int ioapic_force;
 static int __init disable_ioapic_setup(char *str)
 {
 	skip_ioapic_setup = 1;
-	return 1;
+	return 0;
 }
+early_param("noapic", disable_ioapic_setup);
 
-static int __init enable_ioapic_setup(char *str)
+/* Actually the next option is obsolete, but keep it for paranoid reasons -AK */
+static int __init disable_timer_pin_setup(char *arg)
 {
-	ioapic_force = 1;
-	skip_ioapic_setup = 0;
+	disable_timer_pin_1 = 1;
 	return 1;
 }
-
-__setup("noapic", disable_ioapic_setup);
-__setup("apic", enable_ioapic_setup);
+__setup("disable_timer_pin_1", disable_timer_pin_setup);
 
 #ifndef CONFIG_XEN
 static int __init setup_disable_8254_timer(char *s)
@@ -318,137 +419,6 @@ __setup("disable_8254_timer", setup_disa
 __setup("enable_8254_timer", setup_enable_8254_timer);
 #endif /* !CONFIG_XEN */
 
-#include <asm/pci-direct.h>
-#include <linux/pci_ids.h>
-#include <linux/pci.h>
-
-
-#ifdef CONFIG_ACPI
-
-static int nvidia_hpet_detected __initdata;
-
-static int __init nvidia_hpet_check(unsigned long phys, unsigned long size)
-{
-	nvidia_hpet_detected = 1;
-	return 0;
-}
-#endif
-
-/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC
-   off. Check for an Nvidia or VIA PCI bridge and turn it off.
-   Use pci direct infrastructure because this runs before the PCI subsystem. 
-
-   Can be overwritten with "apic"
-
-   And another hack to disable the IOMMU on VIA chipsets.
-
-   ... and others. Really should move this somewhere else.
-
-   Kludge-O-Rama. */
-void __init check_ioapic(void) 
-{ 
-	int num,slot,func; 
-	/* Poor man's PCI discovery */
-	for (num = 0; num < 32; num++) { 
-		for (slot = 0; slot < 32; slot++) { 
-			for (func = 0; func < 8; func++) { 
-				u32 class;
-				u32 vendor;
-				u8 type;
-				class = read_pci_config(num,slot,func,
-							PCI_CLASS_REVISION);
-				if (class == 0xffffffff)
-					break; 
-
-		       		if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
-					continue; 
-
-				vendor = read_pci_config(num, slot, func, 
-							 PCI_VENDOR_ID);
-				vendor &= 0xffff;
-				switch (vendor) { 
-				case PCI_VENDOR_ID_VIA:
-#ifdef CONFIG_IOMMU
-					if ((end_pfn > MAX_DMA32_PFN ||
-					     force_iommu) &&
-					    !iommu_aperture_allowed) {
-						printk(KERN_INFO
-    "Looks like a VIA chipset. Disabling IOMMU. Override with \"iommu=allowed\"\n");
-						iommu_aperture_disabled = 1;
-					}
-#endif
-					return;
-				case PCI_VENDOR_ID_NVIDIA:
-#ifdef CONFIG_ACPI
-					/*
-					 * All timer overrides on Nvidia are
-					 * wrong unless HPET is enabled.
-					 */
-					nvidia_hpet_detected = 0;
-					acpi_table_parse(ACPI_HPET,
-							nvidia_hpet_check);
-					if (nvidia_hpet_detected == 0) {
-						acpi_skip_timer_override = 1;
-						printk(KERN_INFO "Nvidia board "
-						    "detected. Ignoring ACPI "
-						    "timer override.\n");
-					}
-#endif
-					/* RED-PEN skip them on mptables too? */
-					return;
-				case PCI_VENDOR_ID_ATI:
-
-				/* This should be actually default, but
-				   for 2.6.16 let's do it for ATI only where
-				   it's really needed. */
-#ifndef CONFIG_XEN
-					if (timer_over_8254 == 1) {	
-						timer_over_8254 = 0;	
-					printk(KERN_INFO
-		"ATI board detected. Disabling timer routing over 8254.\n");
-					}	
-#endif
-					return;
-				} 
-
-
-				/* No multi-function device? */
-				type = read_pci_config_byte(num,slot,func,
-							    PCI_HEADER_TYPE);
-				if (!(type & 0x80))
-					break;
-			} 
-		}
-	}
-} 
-
-static int __init ioapic_pirq_setup(char *str)
-{
-	int i, max;
-	int ints[MAX_PIRQS+1];
-
-	get_options(str, ARRAY_SIZE(ints), ints);
-
-	for (i = 0; i < MAX_PIRQS; i++)
-		pirq_entries[i] = -1;
-
-	pirqs_enabled = 1;
-	apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n");
-	max = MAX_PIRQS;
-	if (ints[0] < MAX_PIRQS)
-		max = ints[0];
-
-	for (i = 0; i < max; i++) {
-		apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
-		/*
-		 * PIRQs are mapped upside down, usually.
-		 */
-		pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
-	}
-	return 1;
-}
-
-__setup("pirq=", ioapic_pirq_setup);
 
 /*
  * Find the IRQ entry number of a certain pin.
@@ -478,9 +448,7 @@ static int __init find_isa_irq_pin(int i
 	for (i = 0; i < mp_irq_entries; i++) {
 		int lbus = mp_irqs[i].mpc_srcbus;
 
-		if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
-		     mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
-		     mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
+		if (test_bit(lbus, mp_bus_not_pci) &&
 		    (mp_irqs[i].mpc_irqtype == type) &&
 		    (mp_irqs[i].mpc_srcbusirq == irq))
 
@@ -496,9 +464,7 @@ static int __init find_isa_irq_apic(int 
 	for (i = 0; i < mp_irq_entries; i++) {
 		int lbus = mp_irqs[i].mpc_srcbus;
 
-		if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
-		     mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
-		     mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
+		if (test_bit(lbus, mp_bus_not_pci) &&
 		    (mp_irqs[i].mpc_irqtype == type) &&
 		    (mp_irqs[i].mpc_srcbusirq == irq))
 			break;
@@ -539,7 +505,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, 
 			    mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
 				break;
 
-		if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
+		if (!test_bit(lbus, mp_bus_not_pci) &&
 		    !mp_irqs[i].mpc_irqtype &&
 		    (bus == lbus) &&
 		    (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
@@ -562,27 +528,6 @@ int IO_APIC_get_PCI_irq_vector(int bus, 
 	return best_guess;
 }
 
-/*
- * EISA Edge/Level control register, ELCR
- */
-static int EISA_ELCR(unsigned int irq)
-{
-	if (irq < 16) {
-		unsigned int port = 0x4d0 + (irq >> 3);
-		return (inb(port) >> (irq & 7)) & 1;
-	}
-	apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq);
-	return 0;
-}
-
-/* EISA interrupts are always polarity zero and can be edge or level
- * trigger depending on the ELCR value.  If an interrupt is listed as
- * EISA conforming in the MP table, that means its trigger type must
- * be read in from the ELCR */
-
-#define default_EISA_trigger(idx)	(EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
-#define default_EISA_polarity(idx)	(0)
-
 /* ISA interrupts are always polarity zero edge triggered,
  * when listed as conforming in the MP table. */
 
@@ -595,12 +540,6 @@ static int EISA_ELCR(unsigned int irq)
 #define default_PCI_trigger(idx)	(1)
 #define default_PCI_polarity(idx)	(1)
 
-/* MCA interrupts are always polarity zero level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_MCA_trigger(idx)	(1)
-#define default_MCA_polarity(idx)	(0)
-
 static int __init MPBIOS_polarity(int idx)
 {
 	int bus = mp_irqs[idx].mpc_srcbus;
@@ -612,38 +551,11 @@ static int __init MPBIOS_polarity(int id
 	switch (mp_irqs[idx].mpc_irqflag & 3)
 	{
 		case 0: /* conforms, ie. bus-type dependent polarity */
-		{
-			switch (mp_bus_id_to_type[bus])
-			{
-				case MP_BUS_ISA: /* ISA pin */
-				{
-					polarity = default_ISA_polarity(idx);
-					break;
-				}
-				case MP_BUS_EISA: /* EISA pin */
-				{
-					polarity = default_EISA_polarity(idx);
-					break;
-				}
-				case MP_BUS_PCI: /* PCI pin */
-				{
-					polarity = default_PCI_polarity(idx);
-					break;
-				}
-				case MP_BUS_MCA: /* MCA pin */
-				{
-					polarity = default_MCA_polarity(idx);
-					break;
-				}
-				default:
-				{
-					printk(KERN_WARNING "broken BIOS!!\n");
-					polarity = 1;
-					break;
-				}
-			}
+			if (test_bit(bus, mp_bus_not_pci))
+				polarity = default_ISA_polarity(idx);
+			else
+				polarity = default_PCI_polarity(idx);
 			break;
-		}
 		case 1: /* high active */
 		{
 			polarity = 0;
@@ -681,38 +593,11 @@ static int MPBIOS_trigger(int idx)
 	switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
 	{
 		case 0: /* conforms, ie. bus-type dependent */
-		{
-			switch (mp_bus_id_to_type[bus])
-			{
-				case MP_BUS_ISA: /* ISA pin */
-				{
-					trigger = default_ISA_trigger(idx);
-					break;
-				}
-				case MP_BUS_EISA: /* EISA pin */
-				{
-					trigger = default_EISA_trigger(idx);
-					break;
-				}
-				case MP_BUS_PCI: /* PCI pin */
-				{
-					trigger = default_PCI_trigger(idx);
-					break;
-				}
-				case MP_BUS_MCA: /* MCA pin */
-				{
-					trigger = default_MCA_trigger(idx);
-					break;
-				}
-				default:
-				{
-					printk(KERN_WARNING "broken BIOS!!\n");
-					trigger = 1;
-					break;
-				}
-			}
+			if (test_bit(bus, mp_bus_not_pci))
+				trigger = default_ISA_trigger(idx);
+			else
+				trigger = default_PCI_trigger(idx);
 			break;
-		}
 		case 1: /* edge */
 		{
 			trigger = 0;
@@ -749,64 +634,6 @@ static inline int irq_trigger(int idx)
 	return MPBIOS_trigger(idx);
 }
 
-static int next_irq = 16;
-
-/*
- * gsi_irq_sharing -- Name overload!  "irq" can be either a legacy IRQ
- * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
- * from ACPI, which can reach 800 in large boxen.
- *
- * Compact the sparse GSI space into a sequential IRQ series and reuse
- * vectors if possible.
- */
-int gsi_irq_sharing(int gsi)
-{
-	int i, tries, vector;
-
-	BUG_ON(gsi >= NR_IRQ_VECTORS);
-
-	if (platform_legacy_irq(gsi))
-		return gsi;
-
-	if (gsi_2_irq[gsi] != 0xFF)
-		return (int)gsi_2_irq[gsi];
-
-	tries = NR_IRQS;
-  try_again:
-	vector = assign_irq_vector(gsi);
-
-	/*
-	 * Sharing vectors means sharing IRQs, so scan irq_vectors for previous
-	 * use of vector and if found, return that IRQ.  However, we never want
-	 * to share legacy IRQs, which usually have a different trigger mode
-	 * than PCI.
-	 */
-	for (i = 0; i < NR_IRQS; i++)
-		if (IO_APIC_VECTOR(i) == vector)
-			break;
-	if (platform_legacy_irq(i)) {
-		if (--tries >= 0) {
-			IO_APIC_VECTOR(i) = 0;
-			goto try_again;
-		}
-		panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi);
-	}
-	if (i < NR_IRQS) {
-		gsi_2_irq[gsi] = i;
-		printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n",
-				gsi, vector, i);
-		return i;
-	}
-
-	i = next_irq++;
-	BUG_ON(i >= NR_IRQS);
-	gsi_2_irq[gsi] = i;
-	IO_APIC_VECTOR(i) = vector;
-	printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n",
-			gsi, vector, i);
-	return i;
-}
-
 static int pin_2_irq(int idx, int apic, int pin)
 {
 	int irq, i;
@@ -818,49 +645,16 @@ static int pin_2_irq(int idx, int apic, 
 	if (mp_irqs[idx].mpc_dstirq != pin)
 		printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
 
-	switch (mp_bus_id_to_type[bus])
-	{
-		case MP_BUS_ISA: /* ISA pin */
-		case MP_BUS_EISA:
-		case MP_BUS_MCA:
-		{
-			irq = mp_irqs[idx].mpc_srcbusirq;
-			break;
-		}
-		case MP_BUS_PCI: /* PCI pin */
-		{
-			/*
-			 * PCI IRQs are mapped in order
-			 */
-			i = irq = 0;
-			while (i < apic)
-				irq += nr_ioapic_registers[i++];
-			irq += pin;
-			irq = gsi_irq_sharing(irq);
-			break;
-		}
-		default:
-		{
-			printk(KERN_ERR "unknown bus type %d.\n",bus); 
-			irq = 0;
-			break;
-		}
-	}
-	BUG_ON(irq >= NR_IRQS);
-
-	/*
-	 * PCI IRQ command line redirection. Yes, limits are hardcoded.
-	 */
-	if ((pin >= 16) && (pin <= 23)) {
-		if (pirq_entries[pin-16] != -1) {
-			if (!pirq_entries[pin-16]) {
-				apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16);
-			} else {
-				irq = pirq_entries[pin-16];
-				apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n",
-						pin-16, irq);
-			}
-		}
+	if (test_bit(bus, mp_bus_not_pci)) {
+		irq = mp_irqs[idx].mpc_srcbusirq;
+	} else {
+		/*
+		 * PCI IRQs are mapped in order
+		 */
+		i = irq = 0;
+		while (i < apic)
+			irq += nr_ioapic_registers[i++];
+		irq += pin;
 	}
 	BUG_ON(irq >= NR_IRQS);
 	return irq;
@@ -884,43 +678,68 @@ static inline int IO_APIC_irq_trigger(in
 }
 
 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
-u8 irq_vector[NR_IRQ_VECTORS] __read_mostly;
+static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly;
 
-int assign_irq_vector(int irq)
+static int __assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
 {
-	unsigned long flags;
 	int vector;
 	struct physdev_irq irq_op;
   
-  	BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
+	BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
 
-	spin_lock_irqsave(&vector_lock, flags);
+	cpus_and(*result, mask, cpu_online_map);
 
-  	if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) {
-		spin_unlock_irqrestore(&vector_lock, flags);
-  		return IO_APIC_VECTOR(irq);
-	}
+	if (irq_vector[irq] > 0)
+		return irq_vector[irq];
 
 	irq_op.irq = irq;
-	if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
-		spin_unlock_irqrestore(&vector_lock, flags);
+	if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
 		return -ENOSPC;
-	}
 
 	vector = irq_op.vector;
-	vector_irq[vector] = irq;
-	if (irq != AUTO_ASSIGN)
-		IO_APIC_VECTOR(irq) = vector;
+	irq_vector[irq] = vector;
 
-	spin_unlock_irqrestore(&vector_lock, flags);
+	return vector;
+}
 
+static int assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
+{
+	int vector;
+	unsigned long flags;
+
+	spin_lock_irqsave(&vector_lock, flags);
+	vector = __assign_irq_vector(irq, mask, result);
+	spin_unlock_irqrestore(&vector_lock, flags);
 	return vector;
 }
 
-extern void (*interrupt[NR_IRQS])(void);
 #ifndef CONFIG_XEN
-static struct hw_interrupt_type ioapic_level_type;
-static struct hw_interrupt_type ioapic_edge_type;
+void __setup_vector_irq(int cpu)
+{
+	/* Initialize vector_irq on a new cpu */
+	/* This function must be called with vector_lock held */
+	int irq, vector;
+
+	/* Mark the inuse vectors */
+	for (irq = 0; irq < NR_IRQ_VECTORS; ++irq) {
+		if (!cpu_isset(cpu, irq_domain[irq]))
+			continue;
+		vector = irq_vector[irq];
+		per_cpu(vector_irq, cpu)[vector] = irq;
+	}
+	/* Mark the free vectors */
+	for (vector = 0; vector < NR_VECTORS; ++vector) {
+		irq = per_cpu(vector_irq, cpu)[vector];
+		if (irq < 0)
+			continue;
+		if (!cpu_isset(cpu, irq_domain[irq]))
+			per_cpu(vector_irq, cpu)[vector] = -1;
+	}
+}
+
+extern void (*interrupt[NR_IRQS])(void);
+
+static struct irq_chip ioapic_chip;
 
 #define IOAPIC_AUTO	-1
 #define IOAPIC_EDGE	0
@@ -928,16 +747,15 @@ static struct hw_interrupt_type ioapic_e
 
 static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
 {
-	unsigned idx;
-
-	idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq;
-
 	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
 			trigger == IOAPIC_LEVEL)
-		irq_desc[idx].chip = &ioapic_level_type;
-	else
-		irq_desc[idx].chip = &ioapic_edge_type;
-	set_intr_gate(vector, interrupt[idx]);
+		set_irq_chip_and_handler_name(irq, &ioapic_chip,
+					      handle_fasteoi_irq, "fasteoi");
+	else {
+		irq_desc[irq].status |= IRQ_DELAYED_DISABLE;
+		set_irq_chip_and_handler_name(irq, &ioapic_chip,
+					      handle_edge_irq, "edge");
+	}
 }
 #else
 #define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
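
The reworked assign_irq_vector() above now takes the requested CPU mask and
returns the effective domain through *result. A caller sketch mirroring how
set_ioapic_affinity_irq() and setup_IO_APIC_irqs() use it in this patch:

	cpumask_t tmp;
	int vector = assign_irq_vector(irq, TARGET_CPUS, &tmp);

	if (vector >= 0) {
		/* program the RTE: vector plus cpu_mask_to_apicid(tmp) */
	}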
@@ -990,16 +808,21 @@ static void __init setup_IO_APIC_irqs(vo
 			continue;
 
 		if (IO_APIC_IRQ(irq)) {
-			vector = assign_irq_vector(irq);
+			cpumask_t mask;
+			vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
+			if (vector < 0)
+				continue;
+
+			entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
 			entry.vector = vector;
 
 			ioapic_register_intr(irq, vector, IOAPIC_AUTO);
 			if (!apic && (irq < 16))
 				disable_8259A_irq(irq);
 		}
+		ioapic_write_entry(apic, pin, entry);
+
 		spin_lock_irqsave(&ioapic_lock, flags);
-		io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
-		io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
 		set_native_irq_info(irq, TARGET_CPUS);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
@@ -1042,7 +865,7 @@ static void __init setup_ExtINT_IRQ0_pin
 	 * The timer IRQ doesn't have to know that behind the
 	 * scene we have a 8259A-master in AEOI mode ...
 	 */
-	irq_desc[0].chip = &ioapic_edge_type;
+	set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
 
 	/*
 	 * Add it to the IO-APIC irq-routing table:
@@ -1138,10 +961,7 @@ void __apicdebuginit print_IO_APIC(void)
 	for (i = 0; i <= reg_01.bits.entries; i++) {
 		struct IO_APIC_route_entry entry;
 
-		spin_lock_irqsave(&ioapic_lock, flags);
-		*(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
-		*(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
-		spin_unlock_irqrestore(&ioapic_lock, flags);
+		entry = ioapic_read_entry(apic, i);
 
 		printk(KERN_DEBUG " %02x %03X %02X  ",
 			i,
@@ -1161,17 +981,12 @@ void __apicdebuginit print_IO_APIC(void)
 		);
 	}
 	}
-	if (use_pci_vector())
-		printk(KERN_INFO "Using vector-based indexing\n");
 	printk(KERN_DEBUG "IRQ to pin mappings:\n");
 	for (i = 0; i < NR_IRQS; i++) {
 		struct irq_pin_list *entry = irq_2_pin + i;
 		if (entry->pin < 0)
 			continue;
- 		if (use_pci_vector() && !platform_legacy_irq(i))
-			printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
-		else
-			printk(KERN_DEBUG "IRQ%d ", i);
+		printk(KERN_DEBUG "IRQ%d ", i);
 		for (;;) {
 			printk("-> %d:%d", entry->apic, entry->pin);
 			if (!entry->next)
@@ -1335,9 +1150,6 @@ static void __init enable_IO_APIC(void)
 		irq_2_pin[i].pin = -1;
 		irq_2_pin[i].next = 0;
 	}
-	if (!pirqs_enabled)
-		for (i = 0; i < MAX_PIRQS; i++)
-			pirq_entries[i] = -1;
 
 	/*
 	 * The number of IO-APIC IRQ registers (== #pins):
@@ -1354,11 +1166,7 @@ static void __init enable_IO_APIC(void)
 		/* See if any of the pins is in ExtINT mode */
 		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
 			struct IO_APIC_route_entry entry;
-			spin_lock_irqsave(&ioapic_lock, flags);
-			*(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
-			*(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
-			spin_unlock_irqrestore(&ioapic_lock, flags);
-
+			entry = ioapic_read_entry(apic, pin);
 
 			/* If the interrupt line is enabled and in ExtInt mode
 			 * I have found the pin where the i8259 is connected.
@@ -1412,7 +1220,6 @@ void disable_IO_APIC(void)
 	 */
 	if (ioapic_i8259.pin != -1) {
 		struct IO_APIC_route_entry entry;
-		unsigned long flags;
 
 		memset(&entry, 0, sizeof(entry));
 		entry.mask            = 0; /* Enabled */
@@ -1429,12 +1236,7 @@ void disable_IO_APIC(void)
 		/*
 		 * Add it to the IO-APIC irq-routing table:
 		 */
-		spin_lock_irqsave(&ioapic_lock, flags);
-		io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
-			*(((int *)&entry)+1));
-		io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
-			*(((int *)&entry)+0));
-		spin_unlock_irqrestore(&ioapic_lock, flags);
+		ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
 	}
 
 	disconnect_bsp_APIC(ioapic_i8259.pin != -1);
@@ -1442,76 +1244,6 @@ void disable_IO_APIC(void)
 }
 
 /*
- * function to set the IO-APIC physical IDs based on the
- * values stored in the MPC table.
- *
- * by Matt Domsch <Matt_Domsch@dell.com>  Tue Dec 21 12:25:05 CST 1999
- */
-
-#ifndef CONFIG_XEN
-static void __init setup_ioapic_ids_from_mpc (void)
-{
-	union IO_APIC_reg_00 reg_00;
-	int apic;
-	int i;
-	unsigned char old_id;
-	unsigned long flags;
-
-	/*
-	 * Set the IOAPIC ID to the value stored in the MPC table.
-	 */
-	for (apic = 0; apic < nr_ioapics; apic++) {
-
-		/* Read the register 0 value */
-		spin_lock_irqsave(&ioapic_lock, flags);
-		reg_00.raw = io_apic_read(apic, 0);
-		spin_unlock_irqrestore(&ioapic_lock, flags);
-		
-		old_id = mp_ioapics[apic].mpc_apicid;
-
-
-		printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid);
-
-
-		/*
-		 * We need to adjust the IRQ routing table
-		 * if the ID changed.
-		 */
-		if (old_id != mp_ioapics[apic].mpc_apicid)
-			for (i = 0; i < mp_irq_entries; i++)
-				if (mp_irqs[i].mpc_dstapic == old_id)
-					mp_irqs[i].mpc_dstapic
-						= mp_ioapics[apic].mpc_apicid;
-
-		/*
-		 * Read the right value from the MPC table and
-		 * write it into the ID register.
-	 	 */
-		apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...",
-				mp_ioapics[apic].mpc_apicid);
-
-		reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
-		spin_lock_irqsave(&ioapic_lock, flags);
-		io_apic_write(apic, 0, reg_00.raw);
-		spin_unlock_irqrestore(&ioapic_lock, flags);
-
-		/*
-		 * Sanity check
-		 */
-		spin_lock_irqsave(&ioapic_lock, flags);
-		reg_00.raw = io_apic_read(apic, 0);
-		spin_unlock_irqrestore(&ioapic_lock, flags);
-		if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
-			printk("could not set ID!\n");
-		else
-			apic_printk(APIC_VERBOSE," ok.\n");
-	}
-}
-#else
-static void __init setup_ioapic_ids_from_mpc(void) { }
-#endif
-
-/*
  * There is a nasty bug in some older SMP boards, their mptable lies
  * about the timer IRQ. We do the following to work around the situation:
  *
@@ -1565,7 +1297,7 @@ static int __init timer_irq_works(void)
  * an edge even if it isn't on the 8259A...
  */
 
-static unsigned int startup_edge_ioapic_irq(unsigned int irq)
+static unsigned int startup_ioapic_irq(unsigned int irq)
 {
 	int was_pending = 0;
 	unsigned long flags;
@@ -1582,107 +1314,19 @@ static unsigned int startup_edge_ioapic_
 	return was_pending;
 }
 
-/*
- * Once we have recorded IRQ_PENDING already, we can mask the
- * interrupt for real. This prevents IRQ storms from unhandled
- * devices.
- */
-static void ack_edge_ioapic_irq(unsigned int irq)
-{
-	move_irq(irq);
-	if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
-					== (IRQ_PENDING | IRQ_DISABLED))
-		mask_IO_APIC_irq(irq);
-	ack_APIC_irq();
-}
-
-/*
- * Level triggered interrupts can just be masked,
- * and shutting down and starting up the interrupt
- * is the same as enabling and disabling them -- except
- * with a startup need to return a "was pending" value.
- *
- * Level triggered interrupts are special because we
- * do not touch any IO-APIC register while handling
- * them. We ack the APIC in the end-IRQ handler, not
- * in the start-IRQ-handler. Protection against reentrance
- * from the same interrupt is still provided, both by the
- * generic IRQ layer and by the fact that an unacked local
- * APIC does not accept IRQs.
- */
-static unsigned int startup_level_ioapic_irq (unsigned int irq)
-{
-	unmask_IO_APIC_irq(irq);
-
-	return 0; /* don't check for pending */
-}
-
-static void end_level_ioapic_irq (unsigned int irq)
-{
-	move_irq(irq);
-	ack_APIC_irq();
-}
-
-#ifdef CONFIG_PCI_MSI
-static unsigned int startup_edge_ioapic_vector(unsigned int vector)
-{
-	int irq = vector_to_irq(vector);
-
-	return startup_edge_ioapic_irq(irq);
-}
-
-static void ack_edge_ioapic_vector(unsigned int vector)
-{
-	int irq = vector_to_irq(vector);
-
-	move_native_irq(vector);
-	ack_edge_ioapic_irq(irq);
-}
-
-static unsigned int startup_level_ioapic_vector (unsigned int vector)
-{
-	int irq = vector_to_irq(vector);
-
-	return startup_level_ioapic_irq (irq);
-}
-
-static void end_level_ioapic_vector (unsigned int vector)
-{
-	int irq = vector_to_irq(vector);
-
-	move_native_irq(vector);
-	end_level_ioapic_irq(irq);
-}
-
-static void mask_IO_APIC_vector (unsigned int vector)
-{
-	int irq = vector_to_irq(vector);
-
-	mask_IO_APIC_irq(irq);
-}
-
-static void unmask_IO_APIC_vector (unsigned int vector)
-{
-	int irq = vector_to_irq(vector);
-
-	unmask_IO_APIC_irq(irq);
-}
-
-#ifdef CONFIG_SMP
-static void set_ioapic_affinity_vector (unsigned int vector,
-					cpumask_t cpu_mask)
+static int ioapic_retrigger_irq(unsigned int irq)
 {
-	int irq = vector_to_irq(vector);
+	cpumask_t mask;
+	unsigned vector;
+	unsigned long flags;
 
-	set_native_irq_info(vector, cpu_mask);
-	set_ioapic_affinity_irq(irq, cpu_mask);
-}
-#endif // CONFIG_SMP
-#endif // CONFIG_PCI_MSI
+	spin_lock_irqsave(&vector_lock, flags);
+	vector = irq_vector[irq];
+	cpus_clear(mask);
+	cpu_set(first_cpu(irq_domain[irq]), mask);
 
-static int ioapic_retrigger(unsigned int irq)
-{
-	send_IPI_self(IO_APIC_VECTOR(irq));
+	send_IPI_mask(mask, vector);
+	spin_unlock_irqrestore(&vector_lock, flags);
 
 	return 1;
 }
@@ -1696,32 +1340,47 @@ static int ioapic_retrigger(unsigned int
  * races.
  */
 
-static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
-	.typename = "IO-APIC-edge",
-	.startup 	= startup_edge_ioapic,
-	.shutdown 	= shutdown_edge_ioapic,
-	.enable 	= enable_edge_ioapic,
-	.disable 	= disable_edge_ioapic,
-	.ack 		= ack_edge_ioapic,
-	.end 		= end_edge_ioapic,
-#ifdef CONFIG_SMP
-	.set_affinity = set_ioapic_affinity,
+static void ack_apic_edge(unsigned int irq)
+{
+	move_native_irq(irq);
+	ack_APIC_irq();
+}
+
+static void ack_apic_level(unsigned int irq)
+{
+	int do_unmask_irq = 0;
+
+#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
+	/* If we are moving the irq we need to mask it */
+	if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
+		do_unmask_irq = 1;
+		mask_IO_APIC_irq(irq);
+	}
 #endif
-	.retrigger	= ioapic_retrigger,
-};
 
-static struct hw_interrupt_type ioapic_level_type __read_mostly = {
-	.typename = "IO-APIC-level",
-	.startup 	= startup_level_ioapic,
-	.shutdown 	= shutdown_level_ioapic,
-	.enable 	= enable_level_ioapic,
-	.disable 	= disable_level_ioapic,
-	.ack 		= mask_and_ack_level_ioapic,
-	.end 		= end_level_ioapic,
+	/*
+	 * We must acknowledge the irq before we move it or the acknowledge will
+	 * not propagate properly.
+	 */
+	ack_APIC_irq();
+
+	/* Now we can move and re-enable the irq */
+	move_masked_irq(irq);
+	if (unlikely(do_unmask_irq))
+		unmask_IO_APIC_irq(irq);
+}
+
+static struct irq_chip ioapic_chip __read_mostly = {
+	.name 		= "IO-APIC",
+	.startup 	= startup_ioapic_irq,
+	.mask	 	= mask_IO_APIC_irq,
+	.unmask	 	= unmask_IO_APIC_irq,
+	.ack 		= ack_apic_edge,
+	.eoi 		= ack_apic_level,
 #ifdef CONFIG_SMP
-	.set_affinity = set_ioapic_affinity,
+	.set_affinity 	= set_ioapic_affinity_irq,
 #endif
-	.retrigger	= ioapic_retrigger,
+	.retrigger	= ioapic_retrigger_irq,
 };
 #endif /* !CONFIG_XEN */
 
@@ -1742,12 +1401,7 @@ static inline void init_IO_APIC_traps(vo
 	 */
 	for (irq = 0; irq < NR_IRQS ; irq++) {
 		int tmp = irq;
-		if (use_pci_vector()) {
-			if (!platform_legacy_irq(tmp))
-				if ((tmp = vector_to_irq(tmp)) == -1)
-					continue;
-		}
-		if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
+		if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) {
 			/*
 			 * Hmm.. We don't have an entry for this,
 			 * so default to an old-fashioned 8259
@@ -1758,7 +1412,7 @@ static inline void init_IO_APIC_traps(vo
 #ifndef CONFIG_XEN
 			else
 				/* Strange. Oh, well.. */
-				irq_desc[irq].chip = &no_irq_type;
+				irq_desc[irq].chip = &no_irq_chip;
 #endif
 		}
 	}
@@ -1879,8 +1533,6 @@ static inline void unlock_ExtINT_logic(v
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-int timer_uses_ioapic_pin_0;
-
 /*
  * This code may look a bit paranoid, but it's supposed to cooperate with
  * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
@@ -1893,13 +1545,13 @@ static inline void check_timer(void)
 {
 	int apic1, pin1, apic2, pin2;
 	int vector;
+	cpumask_t mask;
 
 	/*
 	 * get/set the timer IRQ vector:
 	 */
 	disable_8259A_irq(0);
-	vector = assign_irq_vector(0);
-	set_intr_gate(vector, interrupt[0]);
+	vector = assign_irq_vector(0, TARGET_CPUS, &mask);
 
 	/*
 	 * Subtle, code in do_timer_interrupt() expects an AEOI
@@ -1918,9 +1570,6 @@ static inline void check_timer(void)
 	pin2  = ioapic_i8259.pin;
 	apic2 = ioapic_i8259.apic;
 
-	if (pin1 == 0)
-		timer_uses_ioapic_pin_0 = 1;
-
 	apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
 		vector, apic1, pin1, apic2, pin2);
 
@@ -2035,11 +1684,6 @@ void __init setup_IO_APIC(void)
 
 	apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
 
-	/*
-	 * Set up the IO-APIC IRQ routing table.
-	 */
-	if (!acpi_ioapic)
-		setup_ioapic_ids_from_mpc();
 #ifndef CONFIG_XEN
 	sync_Arb_IDs();
 #endif /* !CONFIG_XEN */
@@ -2060,17 +1704,12 @@ static int ioapic_suspend(struct sys_dev
 {
 	struct IO_APIC_route_entry *entry;
 	struct sysfs_ioapic_data *data;
-	unsigned long flags;
 	int i;
 
 	data = container_of(dev, struct sysfs_ioapic_data, dev);
 	entry = data->entry;
-	spin_lock_irqsave(&ioapic_lock, flags);
-	for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
-		*(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
-		*(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
-	}
-	spin_unlock_irqrestore(&ioapic_lock, flags);
+	for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ )
+		*entry = ioapic_read_entry(dev->id, i);
 
 	return 0;
 }
@@ -2092,11 +1731,9 @@ static int ioapic_resume(struct sys_devi
 		reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
 		io_apic_write(dev->id, 0, reg_00.raw);
 	}
-	for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
-		io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
-		io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
-	}
 	spin_unlock_irqrestore(&ioapic_lock, flags);
+	for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
+		ioapic_write_entry(dev->id, i, entry[i]);
 
 	return 0;
 }
@@ -2142,26 +1779,254 @@ static int __init ioapic_init_sysfs(void
 
 device_initcall(ioapic_init_sysfs);
 
-/* --------------------------------------------------------------------------
-                          ACPI-based IOAPIC Configuration
-   -------------------------------------------------------------------------- */
+#ifndef CONFIG_XEN
+/*
+ * Dynamic irq allocate and deallocation
+ */
+int create_irq(void)
+{
+	/* Allocate an unused irq */
+	int irq;
+	int new;
+	int vector = 0;
+	unsigned long flags;
+	cpumask_t mask;
 
-#ifdef CONFIG_ACPI
+	irq = -ENOSPC;
+	spin_lock_irqsave(&vector_lock, flags);
+	for (new = (NR_IRQS - 1); new >= 0; new--) {
+		if (platform_legacy_irq(new))
+			continue;
+		if (irq_vector[new] != 0)
+			continue;
+		vector = __assign_irq_vector(new, TARGET_CPUS, &mask);
+		if (likely(vector > 0))
+			irq = new;
+		break;
+	}
+	spin_unlock_irqrestore(&vector_lock, flags);
 
-#define IO_APIC_MAX_ID		0xFE
+	if (irq >= 0) {
+		dynamic_irq_init(irq);
+	}
+	return irq;
+}
 
-int __init io_apic_get_version (int ioapic)
+void destroy_irq(unsigned int irq)
 {
-	union IO_APIC_reg_01	reg_01;
 	unsigned long flags;
 
-	spin_lock_irqsave(&ioapic_lock, flags);
-	reg_01.raw = io_apic_read(ioapic, 1);
-	spin_unlock_irqrestore(&ioapic_lock, flags);
+	dynamic_irq_cleanup(irq);
+
+	spin_lock_irqsave(&vector_lock, flags);
+	irq_vector[irq] = 0;
+	spin_unlock_irqrestore(&vector_lock, flags);
+}
+#endif
+
+/*
+ * MSI message composition
+ */
+#ifdef CONFIG_PCI_MSI
+static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
+{
+	int vector;
+	unsigned dest;
+	cpumask_t tmp;
+
+	vector = assign_irq_vector(irq, TARGET_CPUS, &tmp);
+	if (vector >= 0) {
+		dest = cpu_mask_to_apicid(tmp);
+
+		msg->address_hi = MSI_ADDR_BASE_HI;
+		msg->address_lo =
+			MSI_ADDR_BASE_LO |
+			((INT_DEST_MODE == 0) ?
+				MSI_ADDR_DEST_MODE_PHYSICAL:
+				MSI_ADDR_DEST_MODE_LOGICAL) |
+			((INT_DELIVERY_MODE != dest_LowestPrio) ?
+				MSI_ADDR_REDIRECTION_CPU:
+				MSI_ADDR_REDIRECTION_LOWPRI) |
+			MSI_ADDR_DEST_ID(dest);
+
+		msg->data =
+			MSI_DATA_TRIGGER_EDGE |
+			MSI_DATA_LEVEL_ASSERT |
+			((INT_DELIVERY_MODE != dest_LowestPrio) ?
+				MSI_DATA_DELIVERY_FIXED:
+				MSI_DATA_DELIVERY_LOWPRI) |
+			MSI_DATA_VECTOR(vector);
+	}
+	return vector;
+}
+
+#ifdef CONFIG_SMP
+static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+	struct msi_msg msg;
+	unsigned int dest;
+	cpumask_t tmp;
+	int vector;
+
+	cpus_and(tmp, mask, cpu_online_map);
+	if (cpus_empty(tmp))
+		tmp = TARGET_CPUS;
+
+	cpus_and(mask, tmp, CPU_MASK_ALL);
+
+	vector = assign_irq_vector(irq, mask, &tmp);
+	if (vector < 0)
+		return;
+
+	dest = cpu_mask_to_apicid(tmp);
+
+	read_msi_msg(irq, &msg);
+
+	msg.data &= ~MSI_DATA_VECTOR_MASK;
+	msg.data |= MSI_DATA_VECTOR(vector);
+	msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+
+	write_msi_msg(irq, &msg);
+	set_native_irq_info(irq, mask);
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
+ * which implement the MSI or MSI-X Capability Structure.
+ */
+static struct irq_chip msi_chip = {
+	.name		= "PCI-MSI",
+	.unmask		= unmask_msi_irq,
+	.mask		= mask_msi_irq,
+	.ack		= ack_apic_edge,
+#ifdef CONFIG_SMP
+	.set_affinity	= set_msi_irq_affinity,
+#endif
+	.retrigger	= ioapic_retrigger_irq,
+};
+
+int arch_setup_msi_irq(unsigned int irq, struct pci_dev *dev)
+{
+	struct msi_msg msg;
+	int ret;
+	ret = msi_compose_msg(dev, irq, &msg);
+	if (ret < 0)
+		return ret;
+
+	write_msi_msg(irq, &msg);
+
+	set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
+
+	return 0;
+}
+
+void arch_teardown_msi_irq(unsigned int irq)
+{
+	return;
+}
+
+#endif /* CONFIG_PCI_MSI */
+
+/*
+ * HyperTransport interrupt support
+ */
+#ifdef CONFIG_HT_IRQ
+
+#ifdef CONFIG_SMP
+
+static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
+{
+	struct ht_irq_msg msg;
+	fetch_ht_irq_msg(irq, &msg);
+
+	msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK);
+	msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
 
-	return reg_01.bits.version;
+	msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest);
+	msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
+
+	write_ht_irq_msg(irq, &msg);
 }
 
+static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+	unsigned int dest;
+	cpumask_t tmp;
+	int vector;
+
+	cpus_and(tmp, mask, cpu_online_map);
+	if (cpus_empty(tmp))
+		tmp = TARGET_CPUS;
+
+	cpus_and(mask, tmp, CPU_MASK_ALL);
+
+	vector = assign_irq_vector(irq, mask, &tmp);
+	if (vector < 0)
+		return;
+
+	dest = cpu_mask_to_apicid(tmp);
+
+	target_ht_irq(irq, dest, vector);
+	set_native_irq_info(irq, mask);
+}
+#endif
+
+static struct irq_chip ht_irq_chip = {
+	.name		= "PCI-HT",
+	.mask		= mask_ht_irq,
+	.unmask		= unmask_ht_irq,
+	.ack		= ack_apic_edge,
+#ifdef CONFIG_SMP
+	.set_affinity	= set_ht_irq_affinity,
+#endif
+	.retrigger	= ioapic_retrigger_irq,
+};
+
+int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
+{
+	int vector;
+	cpumask_t tmp;
+
+	vector = assign_irq_vector(irq, TARGET_CPUS, &tmp);
+	if (vector >= 0) {
+		struct ht_irq_msg msg;
+		unsigned dest;
+
+		dest = cpu_mask_to_apicid(tmp);
+
+		msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
+
+		msg.address_lo =
+			HT_IRQ_LOW_BASE |
+			HT_IRQ_LOW_DEST_ID(dest) |
+			HT_IRQ_LOW_VECTOR(vector) |
+			((INT_DEST_MODE == 0) ?
+				HT_IRQ_LOW_DM_PHYSICAL :
+				HT_IRQ_LOW_DM_LOGICAL) |
+			HT_IRQ_LOW_RQEOI_EDGE |
+			((INT_DELIVERY_MODE != dest_LowestPrio) ?
+				HT_IRQ_LOW_MT_FIXED :
+				HT_IRQ_LOW_MT_ARBITRATED) |
+			HT_IRQ_LOW_IRQ_MASKED;
+
+		write_ht_irq_msg(irq, &msg);
+
+		set_irq_chip_and_handler_name(irq, &ht_irq_chip,
+					      handle_edge_irq, "edge");
+	}
+	return vector;
+}
+#endif /* CONFIG_HT_IRQ */
+
+/* --------------------------------------------------------------------------
+                          ACPI-based IOAPIC Configuration
+   -------------------------------------------------------------------------- */
+
+#ifdef CONFIG_ACPI
+
+#define IO_APIC_MAX_ID		0xFE
 
 int __init io_apic_get_redir_entries (int ioapic)
 {
@@ -2180,6 +2045,8 @@ int io_apic_set_pci_routing (int ioapic,
 {
 	struct IO_APIC_route_entry entry;
 	unsigned long flags;
+	int vector;
+	cpumask_t mask;
 
 	if (!IO_APIC_IRQ(irq)) {
 		apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
@@ -2188,6 +2055,17 @@ int io_apic_set_pci_routing (int ioapic,
 	}
 
 	/*
+	 * IRQs < 16 are already in the irq_2_pin[] map
+	 */
+	if (irq >= 16)
+		add_pin_to_irq(irq, ioapic, pin);
+
+
+	vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
+	if (vector < 0)
+		return vector;
+
+	/*
 	 * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
 	 * Note that we mask (disable) IRQs now -- these get enabled when the
 	 * corresponding device driver registers for this IRQ.
@@ -2197,19 +2075,11 @@ int io_apic_set_pci_routing (int ioapic,
 
 	entry.delivery_mode = INT_DELIVERY_MODE;
 	entry.dest_mode = INT_DEST_MODE;
-	entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+	entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
 	entry.trigger = edge_level;
 	entry.polarity = active_high_low;
 	entry.mask = 1;					 /* Disabled (masked) */
-
-	irq = gsi_irq_sharing(irq);
-	/*
-	 * IRQs < 16 are already in the irq_2_pin[] map
-	 */
-	if (irq >= 16)
-		add_pin_to_irq(irq, ioapic, pin);
-
-	entry.vector = assign_irq_vector(irq);
+	entry.vector = vector & 0xff;
 
 	apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> "
 		"IRQ %d Mode:%i Active:%i)\n", ioapic, 
@@ -2221,10 +2091,10 @@ int io_apic_set_pci_routing (int ioapic,
 	if (!ioapic && (irq < 16))
 		disable_8259A_irq(irq);
 
+	ioapic_write_entry(ioapic, pin, entry);
+
 	spin_lock_irqsave(&ioapic_lock, flags);
-	io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
-	io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
-	set_native_irq_info(use_pci_vector() ?  entry.vector : irq, TARGET_CPUS);
+	set_native_irq_info(irq, TARGET_CPUS);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	return 0;
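
The hunks above repeatedly replace open-coded pairs of io_apic_read()/io_apic_write() calls, each wrapped in its own ioapic_lock critical section, with the new ioapic_read_entry()/ioapic_write_entry() helpers. Their definitions fall outside this excerpt; the following is a minimal sketch reconstructed from the call sites (the union name and exact layout are assumptions):

	union entry_union {
		struct { u32 w1, w2; };
		struct IO_APIC_route_entry entry;
	};

	static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
	{
		union entry_union eu;
		unsigned long flags;

		spin_lock_irqsave(&ioapic_lock, flags);
		eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
		eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
		spin_unlock_irqrestore(&ioapic_lock, flags);
		return eu.entry;
	}

	static void ioapic_write_entry(int apic, int pin,
				       struct IO_APIC_route_entry e)
	{
		union entry_union eu;
		unsigned long flags;

		eu.entry = e;
		spin_lock_irqsave(&ioapic_lock, flags);
		/* high word first: the mask bit lives in the low word,
		 * so the entry is consistent before it can fire */
		io_apic_write(apic, 0x11 + 2 * pin, eu.w2);
		io_apic_write(apic, 0x10 + 2 * pin, eu.w1);
		spin_unlock_irqrestore(&ioapic_lock, flags);
	}
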
Index: head-2008-07-15/arch/x86/kernel/ioport_64-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/ioport_64-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/ioport_64-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -58,6 +58,7 @@ asmlinkage long sys_ioperm(unsigned long
 
 		memset(bitmap, 0xff, IO_BITMAP_BYTES);
 		t->io_bitmap_ptr = bitmap;
+		set_thread_flag(TIF_IO_BITMAP);
 
 		set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
 		set_iobitmap.nr_ports = IO_BITMAP_BITS;
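
Setting TIF_IO_BITMAP at bitmap-allocation time pairs with the process_64-xen.c hunks further down, which test the flag rather than checking io_bitmap_ptr for NULL. A sketch of the intended pattern (illustrative only; handle_io_bitmap is a hypothetical helper):

	/* the flag test is cheap, and the pointer is guaranteed
	 * valid whenever the flag is set */
	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP)))
		handle_io_bitmap(&tsk->thread);	/* hypothetical */
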
Index: head-2008-07-15/arch/x86/kernel/irq_64-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/irq_64-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/irq_64-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -20,11 +20,6 @@
 #include <asm/idle.h>
 
 atomic_t irq_err_count;
-#ifdef CONFIG_X86_IO_APIC
-#ifdef APIC_MISMATCH_DEBUG
-atomic_t irq_mis_count;
-#endif
-#endif
 
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 /*
@@ -79,7 +74,8 @@ int show_interrupts(struct seq_file *p, 
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
 #endif
-		seq_printf(p, " %14s", irq_desc[i].chip->typename);
+		seq_printf(p, " %8s", irq_desc[i].chip->name);
+		seq_printf(p, "-%-8s", irq_desc[i].name);
 
 		seq_printf(p, "  %s", action->name);
 		for (action=action->next; action; action = action->next)
@@ -99,11 +95,6 @@ skip:
 		seq_putc(p, '\n');
 #endif
 		seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
-#ifdef CONFIG_X86_IO_APIC
-#ifdef APIC_MISMATCH_DEBUG
-		seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
-#endif
-#endif
 	}
 	return 0;
 }
@@ -114,24 +105,28 @@ skip:
  * handlers).
  */
 asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
-{	
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+
 	/* high bit used in ret_from_ code  */
 	unsigned irq = ~regs->orig_rax;
 
-	if (unlikely(irq >= NR_IRQS)) {
-		printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
-					__FUNCTION__, irq);
-		BUG();
-	}
-
 	exit_idle();
 	irq_enter();
+
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 	stack_overflow_check(regs);
 #endif
-	__do_IRQ(irq, regs);
+
+	if (likely(irq < NR_IRQS))
+		generic_handle_irq(irq);
+	else
+		printk(KERN_EMERG "%s: %d.%d No irq handler for irq\n",
+			__func__, smp_processor_id(), irq);
+
 	irq_exit();
 
+	set_irq_regs(old_regs);
 	return 1;
 }
 
@@ -192,6 +187,6 @@ EXPORT_SYMBOL(do_softirq);
  */
 void ack_bad_irq(unsigned int irq)
 {
-        printk("unexpected IRQ trap at vector %02x\n", irq);
+        printk("unexpected IRQ trap at irq %02x\n", irq);
 }
 #endif
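
With the two seq_printf() calls above, each /proc/interrupts row now shows the irq_chip name and the flow-handler name joined by a dash, in place of the old hw_interrupt_type typename. Illustrative output (counts and column widths approximate):

	  0:     154077    IO-APIC-edge      timer
	 14:      10948    IO-APIC-edge      ide0
	 18:          0    IO-APIC-fasteoi   ehci_hcd:usb1
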
Index: head-2008-07-15/arch/x86/kernel/mpparse_64-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/mpparse_64-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/mpparse_64-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -41,8 +41,7 @@ int acpi_found_madt;
  * Various Linux-internal data structures created from the
  * MP-table.
  */
-unsigned char apic_version [MAX_APICS];
-unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
 int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
 
 static int mp_current_pci_id = 0;
@@ -56,7 +55,6 @@ struct mpc_config_intsrc mp_irqs[MAX_IRQ
 int mp_irq_entries;
 
 int nr_ioapics;
-int pic_mode;
 unsigned long mp_lapic_addr = 0;
 
 
@@ -71,19 +69,6 @@ unsigned disabled_cpus __initdata;
 /* Bitmask of physically existing CPUs */
 physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
 
-/* ACPI MADT entry parsing functions */
-#ifdef CONFIG_ACPI
-extern struct acpi_boot_flags acpi_boot;
-#ifdef CONFIG_X86_LOCAL_APIC
-extern int acpi_parse_lapic (acpi_table_entry_header *header);
-extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header);
-extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header);
-#endif /*CONFIG_X86_LOCAL_APIC*/
-#ifdef CONFIG_X86_IO_APIC
-extern int acpi_parse_ioapic (acpi_table_entry_header *header);
-#endif /*CONFIG_X86_IO_APIC*/
-#endif /*CONFIG_ACPI*/
-
 u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
 
 
@@ -109,24 +94,20 @@ static int __init mpf_checksum(unsigned 
 static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
 {
 	int cpu;
-	unsigned char ver;
 	cpumask_t tmp_map;
+	char *bootup_cpu = "";
 
 	if (!(m->mpc_cpuflag & CPU_ENABLED)) {
 		disabled_cpus++;
 		return;
 	}
-
-	printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
-		m->mpc_apicid,
-	       (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8,
-	       (m->mpc_cpufeature & CPU_MODEL_MASK)>>4,
-		m->mpc_apicver);
-
 	if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
-		Dprintk("    Bootup CPU\n");
+		bootup_cpu = " (Bootup-CPU)";
 		boot_cpu_id = m->mpc_apicid;
 	}
+
+	printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
+
 	if (num_processors >= NR_CPUS) {
 		printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
 			" Processor ignored.\n", NR_CPUS);
@@ -137,24 +118,7 @@ static void __cpuinit MP_processor_info 
 	cpus_complement(tmp_map, cpu_present_map);
 	cpu = first_cpu(tmp_map);
 
-#if MAX_APICS < 255	
-	if ((int)m->mpc_apicid > MAX_APICS) {
-		printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
-			m->mpc_apicid, MAX_APICS);
-		return;
-	}
-#endif
-	ver = m->mpc_apicver;
-
 	physid_set(m->mpc_apicid, phys_cpu_present_map);
-	/*
-	 * Validate version
-	 */
-	if (ver == 0x0) {
-		printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid);
-		ver = 0x10;
-	}
-	apic_version[m->mpc_apicid] = ver;
  	if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
  		/*
  		 * bios_cpu_apicid is required to have processors listed
@@ -185,37 +149,42 @@ static void __init MP_bus_info (struct m
 	Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
 
 	if (strncmp(str, "ISA", 3) == 0) {
-		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
-	} else if (strncmp(str, "EISA", 4) == 0) {
-		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
+		set_bit(m->mpc_busid, mp_bus_not_pci);
 	} else if (strncmp(str, "PCI", 3) == 0) {
-		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
+		clear_bit(m->mpc_busid, mp_bus_not_pci);
 		mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
 		mp_current_pci_id++;
-	} else if (strncmp(str, "MCA", 3) == 0) {
-		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
 	} else {
 		printk(KERN_ERR "Unknown bustype %s\n", str);
 	}
 }
 
+static int bad_ioapic(unsigned long address)
+{
+	if (nr_ioapics >= MAX_IO_APICS) {
+		printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
+			"(found %d)\n", MAX_IO_APICS, nr_ioapics);
+		panic("Recompile kernel with bigger MAX_IO_APICS!\n");
+	}
+	if (!address) {
+		printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
+			" found in table, skipping!\n");
+		return 1;
+	}
+	return 0;
+}
+
 static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
 {
 	if (!(m->mpc_flags & MPC_APIC_USABLE))
 		return;
 
-	printk("I/O APIC #%d Version %d at 0x%X.\n",
-		m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
-	if (nr_ioapics >= MAX_IO_APICS) {
-		printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n",
-			MAX_IO_APICS, nr_ioapics);
-		panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
-	}
-	if (!m->mpc_apicaddr) {
-		printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
-			" found in MP table, skipping!\n");
+	printk("I/O APIC #%d at 0x%X.\n",
+		m->mpc_apicid, m->mpc_apicaddr);
+
+	if (bad_ioapic(m->mpc_apicaddr))
 		return;
-	}
+
 	mp_ioapics[nr_ioapics] = *m;
 	nr_ioapics++;
 }
@@ -239,19 +208,6 @@ static void __init MP_lintsrc_info (stru
 			m->mpc_irqtype, m->mpc_irqflag & 3,
 			(m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
 			m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
-	/*
-	 * Well it seems all SMP boards in existence
-	 * use ExtINT/LVT1 == LINT0 and
-	 * NMI/LVT2 == LINT1 - the following check
-	 * will show us if this assumptions is false.
-	 * Until then we do not have to add baggage.
-	 */
-	if ((m->mpc_irqtype == mp_ExtINT) &&
-		(m->mpc_destapiclint != 0))
-			BUG();
-	if ((m->mpc_irqtype == mp_NMI) &&
-		(m->mpc_destapiclint != 1))
-			BUG();
 }
 
 /*
@@ -265,7 +221,7 @@ static int __init smp_read_mpc(struct mp
 	unsigned char *mpt=((unsigned char *)mpc)+count;
 
 	if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
-		printk("SMP mptable: bad signature [%c%c%c%c]!\n",
+		printk("MPTABLE: bad signature [%c%c%c%c]!\n",
 			mpc->mpc_signature[0],
 			mpc->mpc_signature[1],
 			mpc->mpc_signature[2],
@@ -273,31 +229,31 @@ static int __init smp_read_mpc(struct mp
 		return 0;
 	}
 	if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
-		printk("SMP mptable: checksum error!\n");
+		printk("MPTABLE: checksum error!\n");
 		return 0;
 	}
 	if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
-		printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
+		printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
 			mpc->mpc_spec);
 		return 0;
 	}
 	if (!mpc->mpc_lapic) {
-		printk(KERN_ERR "SMP mptable: null local APIC address!\n");
+		printk(KERN_ERR "MPTABLE: null local APIC address!\n");
 		return 0;
 	}
 	memcpy(str,mpc->mpc_oem,8);
-	str[8]=0;
-	printk(KERN_INFO "OEM ID: %s ",str);
+	str[8] = 0;
+	printk(KERN_INFO "MPTABLE: OEM ID: %s ",str);
 
 	memcpy(str,mpc->mpc_productid,12);
-	str[12]=0;
-	printk("Product ID: %s ",str);
+	str[12] = 0;
+	printk("MPTABLE: Product ID: %s ",str);
 
-	printk("APIC at: 0x%X\n",mpc->mpc_lapic);
+	printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic);
 
 	/* save the local APIC address, it might be non-default */
 	if (!acpi_lapic)
-	mp_lapic_addr = mpc->mpc_lapic;
+		mp_lapic_addr = mpc->mpc_lapic;
 
 	/*
 	 *	Now process the configuration blocks.
@@ -309,7 +265,7 @@ static int __init smp_read_mpc(struct mp
 				struct mpc_config_processor *m=
 					(struct mpc_config_processor *)mpt;
 				if (!acpi_lapic)
-				MP_processor_info(m);
+					MP_processor_info(m);
 				mpt += sizeof(*m);
 				count += sizeof(*m);
 				break;
@@ -328,8 +284,8 @@ static int __init smp_read_mpc(struct mp
 				struct mpc_config_ioapic *m=
 					(struct mpc_config_ioapic *)mpt;
 				MP_ioapic_info(m);
-				mpt+=sizeof(*m);
-				count+=sizeof(*m);
+				mpt += sizeof(*m);
+				count += sizeof(*m);
 				break;
 			}
 			case MP_INTSRC:
@@ -338,8 +294,8 @@ static int __init smp_read_mpc(struct mp
 					(struct mpc_config_intsrc *)mpt;
 
 				MP_intsrc_info(m);
-				mpt+=sizeof(*m);
-				count+=sizeof(*m);
+				mpt += sizeof(*m);
+				count += sizeof(*m);
 				break;
 			}
 			case MP_LINTSRC:
@@ -347,15 +303,15 @@ static int __init smp_read_mpc(struct mp
 				struct mpc_config_lintsrc *m=
 					(struct mpc_config_lintsrc *)mpt;
 				MP_lintsrc_info(m);
-				mpt+=sizeof(*m);
-				count+=sizeof(*m);
+				mpt += sizeof(*m);
+				count += sizeof(*m);
 				break;
 			}
 		}
 	}
 	clustered_apic_check();
 	if (!num_processors)
-		printk(KERN_ERR "SMP mptable: no processors registered!\n");
+		printk(KERN_ERR "MPTABLE: no processors registered!\n");
 	return num_processors;
 }
 
@@ -451,13 +407,10 @@ static inline void __init construct_defa
 	 * 2 CPUs, numbered 0 & 1.
 	 */
 	processor.mpc_type = MP_PROCESSOR;
-	/* Either an integrated APIC or a discrete 82489DX. */
-	processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+	processor.mpc_apicver = 0;
 	processor.mpc_cpuflag = CPU_ENABLED;
-	processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
-				   (boot_cpu_data.x86_model << 4) |
-				   boot_cpu_data.x86_mask;
-	processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
+	processor.mpc_cpufeature = 0;
+	processor.mpc_featureflag = 0;
 	processor.mpc_reserved[0] = 0;
 	processor.mpc_reserved[1] = 0;
 	for (i = 0; i < 2; i++) {
@@ -476,14 +429,6 @@ static inline void __init construct_defa
 		case 5:
 			memcpy(bus.mpc_bustype, "ISA   ", 6);
 			break;
-		case 2:
-		case 6:
-		case 3:
-			memcpy(bus.mpc_bustype, "EISA  ", 6);
-			break;
-		case 4:
-		case 7:
-			memcpy(bus.mpc_bustype, "MCA   ", 6);
 	}
 	MP_bus_info(&bus);
 	if (mpc_default_type > 4) {
@@ -494,7 +439,7 @@ static inline void __init construct_defa
 
 	ioapic.mpc_type = MP_IOAPIC;
 	ioapic.mpc_apicid = 2;
-	ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+	ioapic.mpc_apicver = 0;
 	ioapic.mpc_flags = MPC_APIC_USABLE;
 	ioapic.mpc_apicaddr = 0xFEC00000;
 	MP_ioapic_info(&ioapic);
@@ -537,13 +482,6 @@ void __init get_smp_config (void)
  		printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
 
 	printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
-	if (mpf->mpf_feature2 & (1<<7)) {
-		printk(KERN_INFO "    IMCR and PIC compatibility mode.\n");
-		pic_mode = 1;
-	} else {
-		printk(KERN_INFO "    Virtual Wire compatibility mode.\n");
-		pic_mode = 0;
-	}
 
 	/*
 	 * Now see if we need to read further.
@@ -620,7 +558,7 @@ static int __init smp_scan_config (unsig
 	return 0;
 }
 
-void __init find_intel_smp (void)
+void __init find_smp_config(void)
 {
 	unsigned int address;
 
@@ -637,9 +575,7 @@ void __init find_intel_smp (void)
 			smp_scan_config(0xF0000,0x10000))
 		return;
 	/*
-	 * If it is an SMP machine we should know now, unless the
-	 * configuration is in an EISA/MCA bus machine with an
-	 * extended bios data area.
+	 * If it is an SMP machine we should know now.
 	 *
 	 * there is a real-mode segmented pointer pointing to the
 	 * 4K EBDA area at 0x40E, calculate and scan it here.
@@ -660,64 +596,38 @@ void __init find_intel_smp (void)
 	 printk(KERN_INFO "No mptable found.\n");
 }
 
-/*
- * - Intel MP Configuration Table
- */
-void __init find_smp_config (void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
-	find_intel_smp();
-#endif
-}
-
-
 /* --------------------------------------------------------------------------
                             ACPI-based MP Configuration
    -------------------------------------------------------------------------- */
 
 #ifdef CONFIG_ACPI
 
-void __init mp_register_lapic_address (
-	u64			address)
+void __init mp_register_lapic_address(u64 address)
 {
 #ifndef CONFIG_XEN
 	mp_lapic_addr = (unsigned long) address;
-
 	set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
-
 	if (boot_cpu_id == -1U)
 		boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
-
-	Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
 #endif
 }
 
-
-void __cpuinit mp_register_lapic (
-	u8			id, 
-	u8			enabled)
+void __cpuinit mp_register_lapic (u8 id, u8 enabled)
 {
 	struct mpc_config_processor processor;
 	int			boot_cpu = 0;
 	
-	if (id >= MAX_APICS) {
-		printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
-			id, MAX_APICS);
-		return;
-	}
-
-	if (id == boot_cpu_physical_apicid)
+	if (id == boot_cpu_id)
 		boot_cpu = 1;
 
 #ifndef CONFIG_XEN
 	processor.mpc_type = MP_PROCESSOR;
 	processor.mpc_apicid = id;
-	processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
+	processor.mpc_apicver = 0;
 	processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
 	processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
-	processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | 
-		(boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
-	processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
+	processor.mpc_cpufeature = 0;
+	processor.mpc_featureflag = 0;
 	processor.mpc_reserved[0] = 0;
 	processor.mpc_reserved[1] = 0;
 #endif
@@ -725,8 +635,6 @@ void __cpuinit mp_register_lapic (
 	MP_processor_info(&processor);
 }
 
-#ifdef CONFIG_X86_IO_APIC
-
 #define MP_ISA_BUS		0
 #define MP_MAX_IOAPIC_PIN	127
 
@@ -737,11 +645,9 @@ static struct mp_ioapic_routing {
 	u32			pin_programmed[4];
 } mp_ioapic_routing[MAX_IO_APICS];
 
-
-static int mp_find_ioapic (
-	int			gsi)
+static int mp_find_ioapic(int gsi)
 {
-	int			i = 0;
+	int i = 0;
 
 	/* Find the IOAPIC that manages this GSI. */
 	for (i = 0; i < nr_ioapics; i++) {
@@ -751,28 +657,15 @@ static int mp_find_ioapic (
 	}
 
 	printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
-
 	return -1;
 }
-	
 
-void __init mp_register_ioapic (
-	u8			id, 
-	u32			address,
-	u32			gsi_base)
+void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
 {
-	int			idx = 0;
+	int idx = 0;
 
-	if (nr_ioapics >= MAX_IO_APICS) {
-		printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
-			"(found %d)\n", MAX_IO_APICS, nr_ioapics);
-		panic("Recompile kernel with bigger MAX_IO_APICS!\n");
-	}
-	if (!address) {
-		printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
-			" found in MADT table, skipping!\n");
+	if (bad_ioapic(address))
 		return;
-	}
 
 	idx = nr_ioapics++;
 
@@ -784,7 +677,7 @@ void __init mp_register_ioapic (
 	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
 #endif
 	mp_ioapics[idx].mpc_apicid = id;
-	mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
+	mp_ioapics[idx].mpc_apicver = 0;
 	
 	/* 
 	 * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
@@ -795,21 +688,15 @@ void __init mp_register_ioapic (
 	mp_ioapic_routing[idx].gsi_end = gsi_base + 
 		io_apic_get_redir_entries(idx);
 
-	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
+	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, "
 		"GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, 
-		mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
+		mp_ioapics[idx].mpc_apicaddr,
 		mp_ioapic_routing[idx].gsi_start,
 		mp_ioapic_routing[idx].gsi_end);
-
-	return;
 }
 
-
-void __init mp_override_legacy_irq (
-	u8			bus_irq,
-	u8			polarity, 
-	u8			trigger, 
-	u32			gsi)
+void __init
+mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 {
 	struct mpc_config_intsrc intsrc;
 	int			ioapic = -1;
@@ -847,22 +734,18 @@ void __init mp_override_legacy_irq (
 	mp_irqs[mp_irq_entries] = intsrc;
 	if (++mp_irq_entries == MAX_IRQ_SOURCES)
 		panic("Max # of irq sources exceeded!\n");
-
-	return;
 }
 
-
-void __init mp_config_acpi_legacy_irqs (void)
+void __init mp_config_acpi_legacy_irqs(void)
 {
 	struct mpc_config_intsrc intsrc;
-	int			i = 0;
-	int			ioapic = -1;
+	int i = 0;
+	int ioapic = -1;
 
 	/* 
 	 * Fabricate the legacy ISA bus (bus #31).
 	 */
-	mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
-	Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
+	set_bit(MP_ISA_BUS, mp_bus_not_pci);
 
 	/* 
 	 * Locate the IOAPIC that manages the ISA IRQs (0-15). 
@@ -915,24 +798,13 @@ void __init mp_config_acpi_legacy_irqs (
 		if (++mp_irq_entries == MAX_IRQ_SOURCES)
 			panic("Max # of irq sources exceeded!\n");
 	}
-
-	return;
 }
 
-#define MAX_GSI_NUM	4096
-
 int mp_register_gsi(u32 gsi, int triggering, int polarity)
 {
-	int			ioapic = -1;
-	int			ioapic_pin = 0;
-	int			idx, bit = 0;
-	static int		pci_irq = 16;
-	/*
-	 * Mapping between Global System Interrupts, which
-	 * represent all possible interrupts, to the IRQs
-	 * assigned to actual devices.
-	 */
-	static int		gsi_to_irq[MAX_GSI_NUM];
+	int ioapic = -1;
+	int ioapic_pin = 0;
+	int idx, bit = 0;
 
 	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
 		return gsi;
@@ -965,47 +837,14 @@ int mp_register_gsi(u32 gsi, int trigger
 	if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
 		Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
 			mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-		return gsi_to_irq[gsi];
+		return gsi;
 	}
 
 	mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
 
-	if (triggering == ACPI_LEVEL_SENSITIVE) {
-		/*
-		 * For PCI devices assign IRQs in order, avoiding gaps
-		 * due to unused I/O APIC pins.
-		 */
-		int irq = gsi;
-		if (gsi < MAX_GSI_NUM) {
-			/*
-			 * Retain the VIA chipset work-around (gsi > 15), but
-			 * avoid a problem where the 8254 timer (IRQ0) is setup
-			 * via an override (so it's not on pin 0 of the ioapic),
-			 * and at the same time, the pin 0 interrupt is a PCI
-			 * type.  The gsi > 15 test could cause these two pins
-			 * to be shared as IRQ0, and they are not shareable.
-			 * So test for this condition, and if necessary, avoid
-			 * the pin collision.
-			 */
-			if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
-				gsi = pci_irq++;
-			/*
-			 * Don't assign IRQ used by ACPI SCI
-			 */
-			if (gsi == acpi_fadt.sci_int)
-				gsi = pci_irq++;
-			gsi_to_irq[irq] = gsi;
-		} else {
-			printk(KERN_ERR "GSI %u is too high\n", gsi);
-			return gsi;
-		}
-	}
-
 	io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
 		triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
 		polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
 	return gsi;
 }
-
-#endif /*CONFIG_X86_IO_APIC*/
 #endif /*CONFIG_ACPI*/
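
With mp_bus_id_to_type[] removed, per-bus classification collapses into the single mp_bus_not_pci bitmap declared near the top of this file. A sketch of the assumed consumer-side test (the real consumers live in the IO-APIC code):

	/* bit set => legacy (non-PCI) bus, i.e. ISA-style trigger and
	 * polarity defaults; bit clear => PCI */
	static inline int mp_bus_is_legacy(int busid)
	{
		return test_bit(busid, mp_bus_not_pci);
	}
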
Index: head-2008-07-15/arch/x86/kernel/process_64-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/process_64-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/process_64-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -89,25 +89,24 @@ void idle_notifier_unregister(struct not
 }
 EXPORT_SYMBOL(idle_notifier_unregister);
 
-enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
-static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
-
 void enter_idle(void)
 {
-	__get_cpu_var(idle_state) = CPU_IDLE;
+	write_pda(isidle, 1);
 	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
 }
 
 static void __exit_idle(void)
 {
-	__get_cpu_var(idle_state) = CPU_NOT_IDLE;
+	if (test_and_clear_bit_pda(0, isidle) == 0)
+		return;
 	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
 }
 
 /* Called from interrupts to signify idle end */
 void exit_idle(void)
 {
-	if (current->pid | read_pda(irqcount))
+	/* idle loop has pid 0 */
+	if (current->pid)
 		return;
 	__exit_idle();
 }
@@ -184,6 +183,9 @@ void cpu_idle (void)
 				play_dead();
 			enter_idle();
 			idle();
+			/* In many cases the interrupt that ended idle
+			   has already called exit_idle(). But some idle
+			   loops can be woken up without an interrupt. */
 			__exit_idle();
 		}
 
@@ -196,7 +198,7 @@ void cpu_idle (void)
 void cpu_idle_wait(void)
 {
 	unsigned int cpu, this_cpu = get_cpu();
-	cpumask_t map;
+	cpumask_t map, tmp = current->cpus_allowed;
 
 	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
 	put_cpu();
@@ -219,6 +221,8 @@ void cpu_idle_wait(void)
 		}
 		cpus_and(map, map, cpu_online_map);
 	} while (!cpus_empty(map));
+
+	set_cpus_allowed(current, tmp);
 }
 EXPORT_SYMBOL_GPL(cpu_idle_wait);
 
@@ -250,9 +254,9 @@ void __show_regs(struct pt_regs * regs)
 	print_modules();
 	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
 		current->pid, current->comm, print_tainted(),
-		system_utsname.release,
-		(int)strcspn(system_utsname.version, " "),
-		system_utsname.version);
+		init_utsname()->release,
+		(int)strcspn(init_utsname()->version, " "),
+		init_utsname()->version);
 	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
 	printk_address(regs->rip); 
 	printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp,
@@ -310,6 +314,7 @@ void exit_thread(void)
 
 		kfree(t->io_bitmap_ptr);
 		t->io_bitmap_ptr = NULL;
+		clear_thread_flag(TIF_IO_BITMAP);
 		/*
 		 * Careful, clear this in the TSS too:
 		 */
@@ -340,6 +345,7 @@ void flush_thread(void)
 		if (t->flags & _TIF_IA32)
 			current_thread_info()->status |= TS_COMPAT;
 	}
+	t->flags &= ~_TIF_DEBUG;
 
 	tsk->thread.debugreg0 = 0;
 	tsk->thread.debugreg1 = 0;
@@ -432,7 +438,7 @@ int copy_thread(int nr, unsigned long cl
 	asm("mov %%es,%0" : "=m" (p->thread.es));
 	asm("mov %%ds,%0" : "=m" (p->thread.ds));
 
-	if (unlikely(me->thread.io_bitmap_ptr != NULL)) { 
+	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
 		if (!p->thread.io_bitmap_ptr) {
 			p->thread.io_bitmap_max = 0;
@@ -440,6 +446,7 @@ int copy_thread(int nr, unsigned long cl
 		}
 		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
 				IO_BITMAP_BYTES);
+		set_tsk_thread_flag(p, TIF_IO_BITMAP);
 	} 
 
 	/*
@@ -474,6 +481,30 @@ static inline void __save_init_fpu( stru
 }
 
 /*
+ * This special macro can be used to load a debugging register
+ */
+#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
+
+static inline void __switch_to_xtra(struct task_struct *prev_p,
+			     	    struct task_struct *next_p)
+{
+	struct thread_struct *prev, *next;
+
+	prev = &prev_p->thread,
+	next = &next_p->thread;
+
+	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
+		loaddebug(next, 0);
+		loaddebug(next, 1);
+		loaddebug(next, 2);
+		loaddebug(next, 3);
+		/* no 4 and 5 */
+		loaddebug(next, 6);
+		loaddebug(next, 7);
+	}
+}
+
+/*
  *	switch_to(x,y) should switch tasks from x to y.
  *
  * This could still be optimized: 
@@ -495,6 +526,10 @@ __switch_to(struct task_struct *prev_p, 
 	struct physdev_set_iobitmap iobmp_op;
 	multicall_entry_t _mcl[8], *mcl = _mcl;
 
+	/* we're going to use this soon, after a few expensive things */
+	if (next_p->fpu_counter>5)
+		prefetch(&next->i387.fxsave);
+
 	/*
 	 * This is basically '__unlazy_fpu', except that we queue a
 	 * multicall to indicate FPU task switch, rather than
@@ -507,7 +542,8 @@ __switch_to(struct task_struct *prev_p, 
 		mcl->op      = __HYPERVISOR_fpu_taskswitch;
 		mcl->args[0] = 1;
 		mcl++;
-	}
+	} else
+		prev_p->fpu_counter = 0;
 
 	/*
 	 * Reload esp0, LDT and the page table pointer:
@@ -587,21 +623,29 @@ __switch_to(struct task_struct *prev_p, 
 	write_pda(oldrsp, next->userrsp); 
 	write_pda(pcurrent, next_p); 
 	write_pda(kernelstack,
-		  task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
+	(unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
+#ifdef CONFIG_CC_STACKPROTECTOR
+	write_pda(stack_canary, next_p->stack_canary);
+
+	/*
+	 * Build time only check to make sure the stack_canary is at
+	 * offset 40 in the pda; this is a gcc ABI requirement
+	 */
+	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
+#endif
 
 	/*
 	 * Now maybe reload the debug registers
 	 */
-	if (unlikely(next->debugreg7)) {
-		set_debugreg(next->debugreg0, 0);
-		set_debugreg(next->debugreg1, 1);
-		set_debugreg(next->debugreg2, 2);
-		set_debugreg(next->debugreg3, 3);
-		/* no 4 and 5 */
-		set_debugreg(next->debugreg6, 6);
-		set_debugreg(next->debugreg7, 7);
-	}
+	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
+		__switch_to_xtra(prev_p, next_p);
 
+	/* If the task has used the FPU in the last 5 timeslices, just do a
+	 * full restore of the math state immediately to avoid the trap; the
+	 * chances of needing the FPU again soon are obviously high now.
+	 */
+	if (next_p->fpu_counter>5)
+		math_state_restore();
 	return prev_p;
 }
 
@@ -821,7 +865,7 @@ int dump_task_regs(struct task_struct *t
 
 unsigned long arch_align_stack(unsigned long sp)
 {
-	if (randomize_va_space)
+	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
 		sp -= get_random_int() % 8192;
 	return sp & ~0xf;
 }
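
On the stack-canary hunk above: the BUILD_BUG_ON pins stack_canary at pda offset 40 because gcc's -fstack-protector code generation on x86-64 hard-codes that address. Roughly what the compiler emits (illustrative, shown as a comment; not part of the patch):

	/*
	 * Prologue:  movq %gs:40, %rax	- stash the canary in the frame
	 * Epilogue:  xorq %gs:40, %rdx	- recheck it; any mismatch ends
	 *				  up in __stack_chk_fail()
	 *
	 * hence whatever pda field sits at %gs:40 must be the canary.
	 */
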
Index: head-2008-07-15/arch/x86/kernel/setup_64-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/setup_64-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/setup_64-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -118,16 +118,6 @@ EXPORT_SYMBOL(boot_cpu_data);
 
 unsigned long mmu_cr4_features;
 
-int acpi_disabled;
-EXPORT_SYMBOL(acpi_disabled);
-#ifdef	CONFIG_ACPI
-extern int __initdata acpi_ht;
-extern acpi_interrupt_flags	acpi_sci_flags;
-int __initdata acpi_force = 0;
-#endif
-
-int acpi_numa __initdata;
-
 /* Boot loader ID as an integer, for the benefit of proc_dointvec */
 int bootloader_type;
 
@@ -151,10 +141,6 @@ struct sys_desc_table_struct {
 
 struct edid_info edid_info;
 EXPORT_SYMBOL_GPL(edid_info);
-struct e820map e820;
-#ifdef CONFIG_XEN
-struct e820map machine_e820;
-#endif
 
 extern int root_mountflags;
 
@@ -181,9 +167,6 @@ struct resource standard_io_resources[] 
 		.flags = IORESOURCE_BUSY | IORESOURCE_IO }
 };
 
-#define STANDARD_IO_RESOURCES \
-	(sizeof standard_io_resources / sizeof standard_io_resources[0])
-
 #define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
 
 struct resource data_resource = {
@@ -230,9 +213,6 @@ static struct resource adapter_rom_resou
 		.flags = IORESOURCE_ROM }
 };
 
-#define ADAPTER_ROM_RESOURCES \
-	(sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
-
 static struct resource video_rom_resource = {
 	.name = "Video ROM",
 	.start = 0xc0000,
@@ -309,7 +289,8 @@ static void __init probe_roms(void)
 	}
 
 	/* check for adapter roms on 2k boundaries */
-	for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
+	for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper;
+	     start += 2048) {
 		rom = isa_bus_to_virt(start);
 		if (!romsignature(rom))
 			continue;
@@ -329,187 +310,22 @@ static void __init probe_roms(void)
 	}
 }
 
-/* Check for full argument with no trailing characters */
-static int fullarg(char *p, char *arg)
+#ifdef CONFIG_PROC_VMCORE
+/* elfcorehdr= specifies the location of the ELF core header
+ * stored by the crashed kernel. This option is passed
+ * by the kexec loader to the capture kernel.
+ */
+static int __init setup_elfcorehdr(char *arg)
 {
-	int l = strlen(arg);
-	return !memcmp(p, arg, l) && (p[l] == 0 || isspace(p[l]));
+	char *end;
+	if (!arg)
+		return -EINVAL;
+	elfcorehdr_addr = memparse(arg, &end);
+	return end > arg ? 0 : -EINVAL;
 }
-
-static __init void parse_cmdline_early (char ** cmdline_p)
-{
-	char c = ' ', *to = command_line, *from = COMMAND_LINE;
-	int len = 0;
-	int userdef = 0;
-
-	for (;;) {
-		if (c != ' ') 
-			goto next_char; 
-
-#ifdef  CONFIG_SMP
-		/*
-		 * If the BIOS enumerates physical processors before logical,
-		 * maxcpus=N at enumeration-time can be used to disable HT.
-		 */
-		else if (!memcmp(from, "maxcpus=", 8)) {
-			extern unsigned int maxcpus;
-
-			maxcpus = simple_strtoul(from + 8, NULL, 0);
-		}
-#endif
-#ifdef CONFIG_ACPI
-		/* "acpi=off" disables both ACPI table parsing and interpreter init */
-		if (fullarg(from,"acpi=off"))
-			disable_acpi();
-
-		if (fullarg(from, "acpi=force")) { 
-			/* add later when we do DMI horrors: */
-			acpi_force = 1;
-			acpi_disabled = 0;
-		}
-
-		/* acpi=ht just means: do ACPI MADT parsing 
-		   at bootup, but don't enable the full ACPI interpreter */
-		if (fullarg(from, "acpi=ht")) { 
-			if (!acpi_force)
-				disable_acpi();
-			acpi_ht = 1; 
-		}
-                else if (fullarg(from, "pci=noacpi")) 
-			acpi_disable_pci();
-		else if (fullarg(from, "acpi=noirq"))
-			acpi_noirq_set();
-
-		else if (fullarg(from, "acpi_sci=edge"))
-			acpi_sci_flags.trigger =  1;
-		else if (fullarg(from, "acpi_sci=level"))
-			acpi_sci_flags.trigger = 3;
-		else if (fullarg(from, "acpi_sci=high"))
-			acpi_sci_flags.polarity = 1;
-		else if (fullarg(from, "acpi_sci=low"))
-			acpi_sci_flags.polarity = 3;
-
-		/* acpi=strict disables out-of-spec workarounds */
-		else if (fullarg(from, "acpi=strict")) {
-			acpi_strict = 1;
-		}
-#ifdef CONFIG_X86_IO_APIC
-		else if (fullarg(from, "acpi_skip_timer_override"))
-			acpi_skip_timer_override = 1;
-#endif
-#endif
-
-#ifndef CONFIG_XEN
-		if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) {
-			clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
-			disable_apic = 1;
-		}
-
-		if (fullarg(from, "noapic"))
-			skip_ioapic_setup = 1;
-
-		if (fullarg(from,"apic")) {
-			skip_ioapic_setup = 0;
-			ioapic_force = 1;
-		}
-#endif
-			
-		if (!memcmp(from, "mem=", 4))
-			parse_memopt(from+4, &from); 
-
-		if (!memcmp(from, "memmap=", 7)) {
-			/* exactmap option is for used defined memory */
-			if (!memcmp(from+7, "exactmap", 8)) {
-#ifdef CONFIG_CRASH_DUMP
-				/* If we are doing a crash dump, we
-				 * still need to know the real mem
-				 * size before original memory map is
-				 * reset.
-				 */
-				saved_max_pfn = e820_end_of_ram();
-#endif
-				from += 8+7;
-				end_pfn_map = 0;
-				e820.nr_map = 0;
-				userdef = 1;
-			}
-			else {
-				parse_memmapopt(from+7, &from);
-				userdef = 1;
-			}
-		}
-
-#ifdef CONFIG_NUMA
-		if (!memcmp(from, "numa=", 5))
-			numa_setup(from+5); 
+early_param("elfcorehdr", setup_elfcorehdr);
 #endif
 
-		if (!memcmp(from,"iommu=",6)) { 
-			iommu_setup(from+6); 
-		}
-
-		if (fullarg(from,"oops=panic"))
-			panic_on_oops = 1;
-
-		if (!memcmp(from, "noexec=", 7))
-			nonx_setup(from + 7);
-
-#ifdef CONFIG_KEXEC
-		/* crashkernel=size@addr specifies the location to reserve for
-		 * a crash kernel.  By reserving this memory we guarantee
-		 * that linux never set's it up as a DMA target.
-		 * Useful for holding code to do something appropriate
-		 * after a kernel panic.
-		 */
-		else if (!memcmp(from, "crashkernel=", 12)) {
-#ifndef CONFIG_XEN
-			unsigned long size, base;
-			size = memparse(from+12, &from);
-			if (*from == '@') {
-				base = memparse(from+1, &from);
-				/* FIXME: Do I want a sanity check
-				 * to validate the memory range?
-				 */
-				crashk_res.start = base;
-				crashk_res.end   = base + size - 1;
-			}
-#else
-			printk("Ignoring crashkernel command line, "
-			       "parameter will be supplied by xen\n");
-#endif
-		}
-#endif
-
-#ifdef CONFIG_PROC_VMCORE
-		/* elfcorehdr= specifies the location of elf core header
-		 * stored by the crashed kernel. This option will be passed
-		 * by kexec loader to the capture kernel.
-		 */
-		else if(!memcmp(from, "elfcorehdr=", 11))
-			elfcorehdr_addr = memparse(from+11, &from);
-#endif
-
-#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN)
-		else if (!memcmp(from, "additional_cpus=", 16))
-			setup_additional_cpus(from+16);
-#endif
-
-	next_char:
-		c = *(from++);
-		if (!c)
-			break;
-		if (COMMAND_LINE_SIZE <= ++len)
-			break;
-		*(to++) = c;
-	}
-	if (userdef) {
-		printk(KERN_INFO "user-defined physical RAM map:\n");
-		e820_print_map("user");
-	}
-	*to = '\0';
-	*cmdline_p = command_line;
-}
-
 #ifndef CONFIG_NUMA
 static void __init
 contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
@@ -521,10 +337,11 @@ contig_initmem_init(unsigned long start_
 	if (bootmap == -1L)
 		panic("Cannot find bootmem map of size %ld\n",bootmap_size);
 	bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
+	e820_register_active_regions(0, start_pfn, end_pfn);
 #ifdef CONFIG_XEN
-	e820_bootmem_free(NODE_DATA(0), 0, xen_start_info->nr_pages<<PAGE_SHIFT);
+	free_bootmem_with_active_regions(0, xen_start_info->nr_pages);
 #else
-	e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT);
+	free_bootmem_with_active_regions(0, end_pfn);
 #endif
 	reserve_bootmem(bootmap, bootmap_size);
 } 
@@ -587,6 +404,10 @@ static void discover_ebda(void)
 void __init setup_arch(char **cmdline_p)
 {
 #ifdef CONFIG_XEN
+	extern struct e820map machine_e820;
+
+	printk(KERN_INFO "Command line: %s\n", saved_command_line);
+
 	/* Register a call for panic conditions. */
 	atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
 
@@ -612,6 +433,8 @@ void __init setup_arch(char **cmdline_p)
 
 	ARCH_SETUP
 #else
+	printk(KERN_INFO "Command line: %s\n", saved_command_line);
+
  	ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
  	screen_info = SCREEN_INFO;
 	edid_info = EDID_INFO;
@@ -639,16 +462,22 @@ void __init setup_arch(char **cmdline_p)
 	data_resource.start = virt_to_phys(&_etext);
 	data_resource.end = virt_to_phys(&_edata)-1;
 
-	parse_cmdline_early(cmdline_p);
-
 	early_identify_cpu(&boot_cpu_data);
 
+	strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE);
+	*cmdline_p = command_line;
+
+	parse_early_param();
+
+	finish_e820_parsing();
+
+	e820_register_active_regions(0, 0, -1UL);
 	/*
 	 * partially used pages are not usable - thus
 	 * we are rounding upwards:
 	 */
 	end_pfn = e820_end_of_ram();
-	num_physpages = end_pfn;		/* for pfn_valid */
+	num_physpages = end_pfn;
 
 	check_efer();
 
@@ -659,6 +488,14 @@ void __init setup_arch(char **cmdline_p)
 	if (is_initial_xendomain())
 		dmi_scan_machine();
 
+	/* How many end-of-memory variables you have, grandma! */
+	max_low_pfn = end_pfn;
+	max_pfn = end_pfn;
+	high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
+
+	/* Remove active ranges so rediscovery with NUMA-awareness happens */
+	remove_all_active_ranges();
+
 #ifdef CONFIG_ACPI_NUMA
 	/*
 	 * Parse SRAT to discover nodes.
@@ -848,16 +685,16 @@ void __init setup_arch(char **cmdline_p)
 				BUG();
 	}
 
+#ifdef CONFIG_ACPI
 	if (!is_initial_xendomain()) {
 		acpi_disabled = 1;
-#ifdef  CONFIG_ACPI
 		acpi_ht = 0;
-#endif
 	}
 #endif
+#endif
 
-#ifndef CONFIG_XEN
-	check_ioapic();
+#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
+	early_quirks();
 #endif
 
 	zap_low_mappings(0);
@@ -907,6 +744,7 @@ void __init setup_arch(char **cmdline_p)
 		e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
 #else
 	e820_reserve_resources(e820.map, e820.nr_map);
+	e820_mark_nosave_regions();
 #endif
 
 	request_resource(&iomem_resource, &video_ram_resource);
@@ -914,7 +752,7 @@ void __init setup_arch(char **cmdline_p)
 	{
 	unsigned i;
 	/* request I/O space for devices used on all i[345]86 PCs */
-	for (i = 0; i < STANDARD_IO_RESOURCES; i++)
+	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
 		request_resource(&ioport_resource, &standard_io_resources[i]);
 	}
 
@@ -1098,7 +936,7 @@ static void __init amd_detect_cmp(struct
 #endif
 }
 
-static void __init init_amd(struct cpuinfo_x86 *c)
+static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 {
 	unsigned level;
 
@@ -1154,6 +992,12 @@ static void __init init_amd(struct cpuin
 
 	/* Fix cpuid4 emulation for more */
 	num_cache_leaves = 3;
+
+	/* When there is only one core no need to synchronize RDTSC */
+	if (num_possible_cpus() == 1)
+		set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
+	else
+		clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
 }
 
 static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
@@ -1235,8 +1079,7 @@ static void srat_detect_node(void)
 		node = first_node(node_online_map);
 	numa_set_node(cpu, node);
 
-	if (acpi_numa > 0)
-		printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
+	printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
 #endif
 }
 
@@ -1270,6 +1113,8 @@ static void __cpuinit init_intel(struct 
 	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
 	    (c->x86 == 0x6 && c->x86_model >= 0x0e))
 		set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
+	if (c->x86 == 6)
+		set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
 	set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
  	c->x86_max_cores = intel_num_cpu_cores(c);
 
@@ -1488,8 +1333,8 @@ static int show_cpuinfo(struct seq_file 
 
 		/* Intel-defined (#2) */
 		"pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
-		"tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+		"tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
+		NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 
 		/* VIA/Cyrix/Centaur-defined */
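
The deleted parse_cmdline_early() scanner gives way to per-option early_param()
registrations (see the elfcorehdr conversion above), which the generic
parse_early_param() call in setup_arch() walks. A minimal sketch of the handler
shape, where the "crashzone" option and its variable are invented purely for
illustration:

#include <linux/init.h>
#include <linux/kernel.h>

static unsigned long crashzone_size __initdata;

/* Called by parse_early_param() when "crashzone=" appears on the command
 * line; s points at the text after the '='. */
static int __init crashzone_setup(char *s)
{
	if (!s)
		return -EINVAL;			/* option given without a value */
	crashzone_size = memparse(s, &s);	/* understands K/M/G suffixes */
	return 0;				/* 0 means consumed successfully */
}
early_param("crashzone", crashzone_setup);
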
Index: head-2008-07-15/arch/x86/kernel/setup64-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/setup64-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/setup64-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -31,7 +31,7 @@
 #include <asm/hypervisor.h>
 #endif
 
-char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
+char x86_boot_params[BOOT_PARAM_SIZE] __initdata;
 
 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
 
@@ -55,8 +55,10 @@ Control non executable mappings for 64bi
 on	Enable(default)
 off	Disable
 */ 
-int __init nonx_setup(char *str)
+static int __init nonx_setup(char *str)
 {
+	if (!str)
+		return -EINVAL;
 	if (!strncmp(str, "on", 2)) {
                 __supported_pte_mask |= _PAGE_NX; 
  		do_not_nx = 0; 
@@ -64,9 +66,9 @@ int __init nonx_setup(char *str)
 		do_not_nx = 1;
 		__supported_pte_mask &= ~_PAGE_NX;
         }
-	return 1;
+	return 0;
 } 
-__setup("noexec=", nonx_setup);	/* parsed early actually */
+early_param("noexec", nonx_setup);
 
 int force_personality32 = 0; 
 
@@ -102,12 +104,9 @@ void __init setup_per_cpu_areas(void)
 #endif
 
 	/* Copy section for each CPU (we discard the original) */
-	size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
-#ifdef CONFIG_MODULES
-	if (size < PERCPU_ENOUGH_ROOM)
-		size = PERCPU_ENOUGH_ROOM;
-#endif
+	size = PERCPU_ENOUGH_ROOM;
 
+	printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
 	for_each_cpu_mask (i, cpu_possible_map) {
 		char *ptr;
 
@@ -169,7 +168,10 @@ void pda_init(int cpu)
 	/* Setup up data that may be needed in __get_free_pages early */
 	asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); 
 #ifndef CONFIG_XEN
+	/* Memory clobbers used to order PDA accesses */
+	mb();
 	wrmsrl(MSR_GS_BASE, pda);
+	mb();
 #else
 	if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
 					(unsigned long)pda))
@@ -302,28 +304,17 @@ void __cpuinit cpu_init (void)
 	 * set up and load the per-CPU TSS
 	 */
 	for (v = 0; v < N_EXCEPTION_STACKS; v++) {
+		static const unsigned int order[N_EXCEPTION_STACKS] = {
+			[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
+			[DEBUG_STACK - 1] = DEBUG_STACK_ORDER
+		};
 		if (cpu) {
-			static const unsigned int order[N_EXCEPTION_STACKS] = {
-				[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
-				[DEBUG_STACK - 1] = DEBUG_STACK_ORDER
-			};
-
 			estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
 			if (!estacks)
 				panic("Cannot allocate exception stack %ld %d\n",
 				      v, cpu); 
 		}
-		switch (v + 1) {
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
-		case DEBUG_STACK:
-			cpu_pda(cpu)->debugstack = (unsigned long)estacks;
-			estacks += DEBUG_STKSZ;
-			break;
-#endif
-		default:
-			estacks += EXCEPTION_STKSZ;
-			break;
-		}
+		estacks += PAGE_SIZE << order[v];
 		orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
 	}
 
Index: head-2008-07-15/arch/x86/kernel/smp_64-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/smp_64-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/smp_64-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -381,9 +381,8 @@ int smp_call_function_single (int cpu, v
 	/* prevent preemption and reschedule on another processor */
 	int me = get_cpu();
 	if (cpu == me) {
-		WARN_ON(1);
 		put_cpu();
-		return -EBUSY;
+		return 0;
 	}
 	spin_lock_bh(&call_lock);
 	__smp_call_function_single(cpu, func, info, nonatomic, wait);
@@ -501,7 +500,7 @@ void smp_send_stop(void)
 #ifndef CONFIG_XEN
 asmlinkage void smp_reschedule_interrupt(void)
 #else
-asmlinkage irqreturn_t smp_reschedule_interrupt(void)
+asmlinkage irqreturn_t smp_reschedule_interrupt(int irq, void *ctx)
 #endif
 {
 #ifndef CONFIG_XEN
@@ -514,7 +513,7 @@ asmlinkage irqreturn_t smp_reschedule_in
 #ifndef CONFIG_XEN
 asmlinkage void smp_call_function_interrupt(void)
 #else
-asmlinkage irqreturn_t smp_call_function_interrupt(void)
+asmlinkage irqreturn_t smp_call_function_interrupt(int irq, void *ctx)
 #endif
 {
 	void (*func) (void *info) = call_data->func;
@@ -545,31 +544,3 @@ asmlinkage irqreturn_t smp_call_function
 	return IRQ_HANDLED;
 #endif
 }
-
-int safe_smp_processor_id(void)
-{
-#ifdef CONFIG_XEN
-	return smp_processor_id();
-#else
-	unsigned apicid, i;
-
-	if (disable_apic)
-		return 0;
-
-	apicid = hard_smp_processor_id();
-	if (apicid < NR_CPUS && x86_cpu_to_apicid[apicid] == apicid)
-		return apicid;
-
-	for (i = 0; i < NR_CPUS; ++i) {
-		if (x86_cpu_to_apicid[i] == apicid)
-			return i;
-	}
-
-	/* No entries in x86_cpu_to_apicid?  Either no MPS|ACPI,
-	 * or called too early.  Either way, we must be CPU 0. */
-      	if (x86_cpu_to_apicid[0] == BAD_APICID)
-		return 0;
-
-	return 0; /* Should not happen */
-#endif
-}
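
The Xen IPI handlers above adopt the 2.6.19 genirq signature: struct pt_regs
is no longer passed to interrupt handlers, leaving just (int irq, void
*dev_id); a handler that still needs the register frame calls get_irq_regs()
instead. A sketch of the new shape, with my_dev and my_handler as invented
names:

#include <linux/interrupt.h>

struct my_dev {
	int pending;			/* illustrative device state */
};

static irqreturn_t my_handler(int irq, void *dev_id)
{
	struct my_dev *dev = dev_id;

	if (!dev->pending)
		return IRQ_NONE;	/* not ours, nothing to do */
	dev->pending = 0;
	return IRQ_HANDLED;
}
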
Index: head-2008-07-15/arch/x86/kernel/traps_64-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/traps_64-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/traps_64-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -23,6 +23,7 @@
 #include <linux/delay.h>
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
+#include <linux/kallsyms.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/nmi.h>
@@ -45,6 +46,7 @@
 #include <asm/pda.h>
 #include <asm/proto.h>
 #include <asm/nmi.h>
+#include <asm/stacktrace.h>
 
 asmlinkage void divide_error(void);
 asmlinkage void debug(void);
@@ -114,7 +116,6 @@ static int call_trace = 1;
 #endif
 
 #ifdef CONFIG_KALLSYMS
-# include <linux/kallsyms.h>
 void printk_address(unsigned long address)
 {
 	unsigned long offset = 0, symsize;
@@ -142,7 +143,7 @@ void printk_address(unsigned long addres
 #endif
 
 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
-					unsigned *usedp, const char **idp)
+					unsigned *usedp, char **idp)
 {
 #ifndef CONFIG_X86_NO_TSS
 	static char ids[][8] = {
@@ -162,26 +163,7 @@ static unsigned long *in_exception_stack
 	 * 'stack' is in one of them:
 	 */
 	for (k = 0; k < N_EXCEPTION_STACKS; k++) {
-		unsigned long end;
-
-		/*
-		 * set 'end' to the end of the exception stack.
-		 */
-		switch (k + 1) {
-		/*
-		 * TODO: this block is not needed i think, because
-		 * setup64.c:cpu_init() sets up t->ist[DEBUG_STACK]
-		 * properly too.
-		 */
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
-		case DEBUG_STACK:
-			end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ;
-			break;
-#endif
-		default:
-			end = per_cpu(orig_ist, cpu).ist[k];
-			break;
-		}
+		unsigned long end = per_cpu(orig_ist, cpu).ist[k];
 		/*
 		 * Is 'stack' above this exception frame's end?
 		 * If yes then skip to the next frame.
@@ -236,13 +218,19 @@ static unsigned long *in_exception_stack
 	return NULL;
 }
 
-static int show_trace_unwind(struct unwind_frame_info *info, void *context)
+struct ops_and_data {
+	struct stacktrace_ops *ops;
+	void *data;
+};
+
+static int dump_trace_unwind(struct unwind_frame_info *info, void *context)
 {
+	struct ops_and_data *oad = (struct ops_and_data *)context;
 	int n = 0;
 
 	while (unwind(info) == 0 && UNW_PC(info)) {
 		n++;
-		printk_address(UNW_PC(info));
+		oad->ops->address(oad->data, UNW_PC(info));
 		if (arch_unw_user_mode(info))
 			break;
 	}
@@ -256,13 +244,19 @@ static int show_trace_unwind(struct unwi
  * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
  */
 
-void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack)
+static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
 {
-	const unsigned cpu = safe_smp_processor_id();
+	void *t = (void *)tinfo;
+	return p > t && p < t + THREAD_SIZE - 3;
+}
+
+void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack,
+		struct stacktrace_ops *ops, void *data)
+{
+	const unsigned cpu = smp_processor_id();
 	unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
 	unsigned used = 0;
-
-	printk("\nCall Trace:\n");
+	struct thread_info *tinfo;
 
 	if (!tsk)
 		tsk = current;
@@ -270,32 +264,47 @@ void show_trace(struct task_struct *tsk,
 	if (call_trace >= 0) {
 		int unw_ret = 0;
 		struct unwind_frame_info info;
+		struct ops_and_data oad = { .ops = ops, .data = data };
 
 		if (regs) {
 			if (unwind_init_frame_info(&info, tsk, regs) == 0)
-				unw_ret = show_trace_unwind(&info, NULL);
+				unw_ret = dump_trace_unwind(&info, &oad);
 		} else if (tsk == current)
-			unw_ret = unwind_init_running(&info, show_trace_unwind, NULL);
+			unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
 		else {
 			if (unwind_init_blocked(&info, tsk) == 0)
-				unw_ret = show_trace_unwind(&info, NULL);
+				unw_ret = dump_trace_unwind(&info, &oad);
 		}
 		if (unw_ret > 0) {
 			if (call_trace == 1 && !arch_unw_user_mode(&info)) {
-				print_symbol("DWARF2 unwinder stuck at %s\n",
+				ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
 					     UNW_PC(&info));
 				if ((long)UNW_SP(&info) < 0) {
-					printk("Leftover inexact backtrace:\n");
+					ops->warning(data, "Leftover inexact backtrace:\n");
 					stack = (unsigned long *)UNW_SP(&info);
+					if (!stack)
+						return;
 				} else
-					printk("Full inexact backtrace again:\n");
+					ops->warning(data, "Full inexact backtrace again:\n");
 			} else if (call_trace >= 1)
 				return;
 			else
-				printk("Full inexact backtrace again:\n");
+				ops->warning(data, "Full inexact backtrace again:\n");
 		} else
-			printk("Inexact backtrace:\n");
+			ops->warning(data, "Inexact backtrace:\n");
+	}
+	if (!stack) {
+		unsigned long dummy;
+		stack = &dummy;
+		if (tsk && tsk != current)
+			stack = (unsigned long *)tsk->thread.rsp;
 	}
+	/*
+	 * Align the stack pointer on word boundary, later loops
+	 * rely on that (and corruption / debug info bugs can cause
+	 * unaligned values here):
+	 */
+	stack = (unsigned long *)((unsigned long)stack & ~(sizeof(long)-1));
 
 	/*
 	 * Print function call entries within a stack. 'cond' is the
@@ -305,7 +314,9 @@ void show_trace(struct task_struct *tsk,
 #define HANDLE_STACK(cond) \
 	do while (cond) { \
 		unsigned long addr = *stack++; \
-		if (kernel_text_address(addr)) { \
+		if (oops_in_progress ? 		\
+			__kernel_text_address(addr) : \
+			kernel_text_address(addr)) { \
 			/* \
 			 * If the address is either in the text segment of the \
 			 * kernel, or in the region which contains vmalloc'ed \
@@ -314,7 +325,7 @@ void show_trace(struct task_struct *tsk,
 			 * down the cause of the crash will be able to figure \
 			 * out the call path that was taken. \
 			 */ \
-			printk_address(addr); \
+			ops->address(data, addr);   \
 		} \
 	} while (0)
 
@@ -323,16 +334,17 @@ void show_trace(struct task_struct *tsk,
 	 * current stack address. If the stacks consist of nested
 	 * exceptions
 	 */
-	for ( ; ; ) {
-		const char *id;
+	for (;;) {
+		char *id;
 		unsigned long *estack_end;
 		estack_end = in_exception_stack(cpu, (unsigned long)stack,
 						&used, &id);
 
 		if (estack_end) {
-			printk(" <%s>", id);
+			if (ops->stack(data, id) < 0)
+				break;
 			HANDLE_STACK (stack < estack_end);
-			printk(" <EOE>");
+			ops->stack(data, "<EOE>");
 			/*
 			 * We link to the next stack via the
 			 * second-to-last pointer (index -2 to end) in the
@@ -347,7 +359,8 @@ void show_trace(struct task_struct *tsk,
 				(IRQSTACKSIZE - 64) / sizeof(*irqstack);
 
 			if (stack >= irqstack && stack < irqstack_end) {
-				printk(" <IRQ>");
+				if (ops->stack(data, "IRQ") < 0)
+					break;
 				HANDLE_STACK (stack < irqstack_end);
 				/*
 				 * We link to the next stack (which would be
@@ -356,7 +369,7 @@ void show_trace(struct task_struct *tsk,
 				 */
 				stack = (unsigned long *) (irqstack_end[-1]);
 				irqstack_end = NULL;
-				printk(" <EOI>");
+				ops->stack(data, "EOI");
 				continue;
 			}
 		}
@@ -364,19 +377,58 @@ void show_trace(struct task_struct *tsk,
 	}
 
 	/*
-	 * This prints the process stack:
+	 * This handles the process stack:
 	 */
-	HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0);
+	tinfo = current_thread_info();
+	HANDLE_STACK (valid_stack_ptr(tinfo, stack));
 #undef HANDLE_STACK
+}
+EXPORT_SYMBOL(dump_trace);
+
+static void
+print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+	print_symbol(msg, symbol);
+	printk("\n");
+}
+
+static void print_trace_warning(void *data, char *msg)
+{
+	printk("%s\n", msg);
+}
+
+static int print_trace_stack(void *data, char *name)
+{
+	printk(" <%s> ", name);
+	return 0;
+}
+
+static void print_trace_address(void *data, unsigned long addr)
+{
+	printk_address(addr);
+}
+
+static struct stacktrace_ops print_trace_ops = {
+	.warning = print_trace_warning,
+	.warning_symbol = print_trace_warning_symbol,
+	.stack = print_trace_stack,
+	.address = print_trace_address,
+};
 
+void
+show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack)
+{
+	printk("\nCall Trace:\n");
+	dump_trace(tsk, regs, stack, &print_trace_ops, NULL);
 	printk("\n");
 }
 
-static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long * rsp)
+static void
+_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
 {
 	unsigned long *stack;
 	int i;
-	const int cpu = safe_smp_processor_id();
+	const int cpu = smp_processor_id();
 	unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
 	unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
 
@@ -430,7 +482,7 @@ void show_registers(struct pt_regs *regs
 	int i;
 	int in_kernel = !user_mode(regs);
 	unsigned long rsp;
-	const int cpu = safe_smp_processor_id(); 
+	const int cpu = smp_processor_id();
 	struct task_struct *cur = cpu_pda(cpu)->pcurrent;
 
 		rsp = regs->rsp;
@@ -505,9 +557,11 @@ static unsigned int die_nest_count;
 
 unsigned __kprobes long oops_begin(void)
 {
-	int cpu = safe_smp_processor_id();
+	int cpu = smp_processor_id();
 	unsigned long flags;
 
+	oops_enter();
+
 	/* racy, but better than risking deadlock. */
 	local_irq_save(flags);
 	if (!spin_trylock(&die_lock)) { 
@@ -536,6 +590,7 @@ void __kprobes oops_end(unsigned long fl
 		spin_unlock_irqrestore(&die_lock, flags);
 	if (panic_on_oops)
 		panic("Fatal exception");
+	oops_exit();
 }
 
 void __kprobes __die(const char * str, struct pt_regs * regs, long err)
@@ -573,7 +628,7 @@ void die(const char * str, struct pt_reg
 }
 
 #ifdef CONFIG_X86_LOCAL_APIC
-void __kprobes die_nmi(char *str, struct pt_regs *regs)
+void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
 {
 	unsigned long flags = oops_begin();
 
@@ -581,13 +636,12 @@ void __kprobes die_nmi(char *str, struct
 	 * We are in trouble anyway, lets at least try
 	 * to get a message out.
 	 */
-	printk(str, safe_smp_processor_id());
+	printk(str, smp_processor_id());
 	show_registers(regs);
 	if (kexec_should_crash(current))
 		crash_kexec(regs);
-	if (panic_on_timeout || panic_on_oops)
-		panic("nmi watchdog");
-	printk("console shuts up ...\n");
+	if (do_panic || panic_on_oops)
+		panic("Non maskable interrupt");
 	oops_end(flags);
 	nmi_exit();
 	local_irq_enable();
@@ -734,8 +788,15 @@ asmlinkage void __kprobes do_general_pro
 static __kprobes void
 mem_parity_error(unsigned char reason, struct pt_regs * regs)
 {
-	printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
-	printk("You probably have a hardware problem with your RAM chips\n");
+	printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
+		reason);
+	printk(KERN_EMERG "You probably have a hardware problem with your "
+		"RAM chips\n");
+
+	if (panic_on_unrecovered_nmi)
+		panic("NMI: Not continuing");
+
+	printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
 
 #if 0 /* XEN */
 	/* Clear and disable the memory parity error line. */
@@ -762,9 +823,15 @@ io_check_error(unsigned char reason, str
 
 static __kprobes void
 unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
-{	printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
-	printk("Dazed and confused, but trying to continue\n");
-	printk("Do you have a strange power saving mode enabled?\n");
+{
+	printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
+		reason);
+	printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
+
+	if (panic_on_unrecovered_nmi)
+		panic("NMI: Not continuing");
+
+	printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
 }
 
 /* Runs on IST stack. This code must keep interrupts off all the time.
@@ -789,12 +856,12 @@ asmlinkage __kprobes void default_do_nmi
 		 * Ok, so this is none of the documented NMI sources,
 		 * so it must be the NMI watchdog.
 		 */
-		if (nmi_watchdog > 0) {
-			nmi_watchdog_tick(regs,reason);
+		if (nmi_watchdog_tick(regs,reason))
 			return;
-		}
 #endif
-		unknown_nmi_error(reason, regs);
+		if (!do_nmi_callback(regs,cpu))
+			unknown_nmi_error(reason, regs);
+
 		return;
 	}
 	if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
@@ -1081,6 +1148,7 @@ asmlinkage void math_state_restore(void)
 		init_fpu(me);
 	restore_fpu_checking(&me->thread.i387.fxsave);
 	task_thread_info(me)->status |= TS_USEDFPU;
+	me->fpu_counter++;
 }
 
 
@@ -1141,24 +1209,30 @@ void __cpuinit smp_trap_init(trap_info_t
 }
 
 
-/* Actual parsing is done early in setup.c. */
-static int __init oops_dummy(char *s)
+static int __init oops_setup(char *s)
 { 
-	panic_on_oops = 1;
-	return 1;
+	if (!s)
+		return -EINVAL;
+	if (!strcmp(s, "panic"))
+		panic_on_oops = 1;
+	return 0;
 } 
-__setup("oops=", oops_dummy); 
+early_param("oops", oops_setup);
 
 static int __init kstack_setup(char *s)
 {
+	if (!s)
+		return -EINVAL;
 	kstack_depth_to_print = simple_strtoul(s,NULL,0);
-	return 1;
+	return 0;
 }
-__setup("kstack=", kstack_setup);
+early_param("kstack", kstack_setup);
 
 #ifdef CONFIG_STACK_UNWIND
 static int __init call_trace_setup(char *s)
 {
+	if (!s)
+		return -EINVAL;
 	if (strcmp(s, "old") == 0)
 		call_trace = -1;
 	else if (strcmp(s, "both") == 0)
@@ -1167,7 +1241,7 @@ static int __init call_trace_setup(char 
 		call_trace = 1;
 	else if (strcmp(s, "new") == 0)
 		call_trace = 2;
-	return 1;
+	return 0;
 }
-__setup("call_trace=", call_trace_setup);
+early_param("call_trace", call_trace_setup);
 #endif
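
With show_trace() now layered on dump_trace(), other consumers can pass their
own stacktrace_ops and receive raw return addresses instead of printk output.
A sketch of such a consumer that records addresses into a fixed buffer; the
addr_log and log_* names are invented:

struct addr_log {
	unsigned long entries[16];
	unsigned int nr;
};

static void log_warning(void *data, char *msg) { }
static void log_warning_symbol(void *data, char *msg, unsigned long sym) { }

static int log_stack(void *data, char *name)
{
	return 0;		/* non-negative: keep walking nested stacks */
}

static void log_address(void *data, unsigned long addr)
{
	struct addr_log *log = data;

	if (log->nr < ARRAY_SIZE(log->entries))
		log->entries[log->nr++] = addr;
}

static struct stacktrace_ops log_ops = {
	.warning	= log_warning,
	.warning_symbol	= log_warning_symbol,
	.stack		= log_stack,
	.address	= log_address,
};

/* usage:	struct addr_log log = { .nr = 0 };
 *		dump_trace(current, NULL, NULL, &log_ops, &log); */
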
Index: head-2008-07-15/arch/x86/kernel/vsyscall_64-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/kernel/vsyscall_64-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/kernel/vsyscall_64-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -26,6 +26,10 @@
 #include <linux/seqlock.h>
 #include <linux/jiffies.h>
 #include <linux/sysctl.h>
+#include <linux/getcpu.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/notifier.h>
 
 #include <asm/vsyscall.h>
 #include <asm/pgtable.h>
@@ -33,11 +37,15 @@
 #include <asm/fixmap.h>
 #include <asm/errno.h>
 #include <asm/io.h>
+#include <asm/segment.h>
+#include <asm/desc.h>
+#include <asm/topology.h>
 
 #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
 
 int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
 seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
+int __vgetcpu_mode __section_vgetcpu_mode;
 
 #include <asm/unistd.h>
 
@@ -61,8 +69,7 @@ static __always_inline void do_vgettimeo
 		sequence = read_seqbegin(&__xtime_lock);
 		
 		sec = __xtime.tv_sec;
-		usec = (__xtime.tv_nsec / 1000) +
-			(__jiffies - __wall_jiffies) * (1000000 / HZ);
+		usec = __xtime.tv_nsec / 1000;
 
 		if (__vxtime.mode != VXTIME_HPET) {
 			t = get_cycles_sync();
@@ -72,7 +79,8 @@ static __always_inline void do_vgettimeo
 				 __vxtime.tsc_quot) >> 32;
 			/* See comment in x86_64 do_gettimeofday. */
 		} else {
-			usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) -
+			usec += ((readl((void __iomem *)
+				   fix_to_virt(VSYSCALL_HPET) + 0xf0) -
 				  __vxtime.last) * __vxtime.quot) >> 32;
 		}
 	} while (read_seqretry(&__xtime_lock, sequence));
@@ -127,9 +135,46 @@ time_t __vsyscall(1) vtime(time_t *t)
 	return __xtime.tv_sec;
 }
 
-long __vsyscall(2) venosys_0(void)
-{
-	return -ENOSYS;
+/* Fast way to get current CPU and node.
+   This helps to do per node and per CPU caches in user space.
+   The result is not guaranteed without CPU affinity, but usually
+   works out because the scheduler tries to keep a thread on the same
+   CPU.
+
+   tcache must point to a two-element array of longs.
+   All arguments can be NULL. */
+long __vsyscall(2)
+vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
+{
+	unsigned int dummy, p;
+	unsigned long j = 0;
+
+	/* Fast cache - only recompute value once per jiffies and avoid
+	   relatively costly rdtscp/cpuid otherwise.
+	   This works because the scheduler usually keeps the process
+	   on the same CPU and this syscall doesn't guarantee its
+	   results anyway.
+	   We do this here because otherwise user space would do it on
+	   its own in a likely inferior way (no access to jiffies).
+	   If you don't like it pass NULL. */
+	if (tcache && tcache->blob[0] == (j = __jiffies)) {
+		p = tcache->blob[1];
+	} else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
+		/* Load per CPU data from RDTSCP */
+		rdtscp(dummy, dummy, p);
+	} else {
+		/* Load per CPU data from GDT */
+		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+	}
+	if (tcache) {
+		tcache->blob[0] = j;
+		tcache->blob[1] = p;
+	}
+	if (cpu)
+		*cpu = p & 0xfff;
+	if (node)
+		*node = p >> 12;
+	return 0;
 }
 
 long __vsyscall(3) venosys_1(void)
@@ -149,7 +194,8 @@ static int vsyscall_sysctl_change(ctl_ta
                         void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	extern u16 vsysc1, vsysc2;
-	u16 *map1, *map2;
+	u16 __iomem *map1;
+	u16 __iomem *map2;
 	int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
 	if (!write)
 		return ret;
@@ -164,11 +210,11 @@ static int vsyscall_sysctl_change(ctl_ta
 		goto out;
 	}
 	if (!sysctl_vsyscall) {
-		*map1 = SYSCALL;
-		*map2 = SYSCALL;
+		writew(SYSCALL, map1);
+		writew(SYSCALL, map2);
 	} else {
-		*map1 = NOP2;
-		*map2 = NOP2;
+		writew(NOP2, map1);
+		writew(NOP2, map2);
 	}
 	iounmap(map2);
 out:
@@ -200,6 +246,48 @@ static ctl_table kernel_root_table2[] = 
 
 #endif
 
+/* Assume __initcall executes before all user space. Hopefully kmod
+   doesn't violate that. We'll find out if it does. */
+static void __cpuinit vsyscall_set_cpu(int cpu)
+{
+	unsigned long d;
+	unsigned long node = 0;
+#ifdef CONFIG_NUMA
+	node = cpu_to_node[cpu];
+#endif
+	if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP))
+		write_rdtscp_aux((node << 12) | cpu);
+
+	/* Store cpu number in limit so that it can be loaded quickly
+	   in user space in vgetcpu.
+	   12 bits for the CPU and 8 bits for the node. */
+	d = 0x0f40000000000ULL;
+	d |= cpu;
+	d |= (node & 0xf) << 12;
+	d |= (node >> 4) << 48;
+	if (HYPERVISOR_update_descriptor(virt_to_machine(cpu_gdt(cpu)
+							 + GDT_ENTRY_PER_CPU),
+					 d))
+		BUG();
+}
+
+static void __cpuinit cpu_vsyscall_init(void *arg)
+{
+	/* preemption should be already off */
+	vsyscall_set_cpu(raw_smp_processor_id());
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int __cpuinit
+cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
+{
+	long cpu = (long)arg;
+	if (action == CPU_ONLINE)
+		smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
+	return NOTIFY_DONE;
+}
+#endif
+
 static void __init map_vsyscall(void)
 {
 	extern char __vsyscall_0;
@@ -214,13 +302,20 @@ static int __init vsyscall_init(void)
 			VSYSCALL_ADDR(__NR_vgettimeofday)));
 	BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
 	BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
+	BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
 	map_vsyscall();
 #ifdef CONFIG_XEN
 	sysctl_vsyscall = 0; /* disable vgettimeofay() */
+ 	if (boot_cpu_has(X86_FEATURE_RDTSCP))
+		vgetcpu_mode = VGETCPU_RDTSCP;
+	else
+		vgetcpu_mode = VGETCPU_LSL;
 #endif
 #ifdef CONFIG_SYSCTL
 	register_sysctl_table(kernel_root_table2, 0);
 #endif
+	on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
+	hotcpu_notifier(cpu_vsyscall_notifier, 0);
 	return 0;
 }
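
vgetcpu() packs the CPU number into the low 12 bits of the value it reads
(from the per-CPU GDT limit, or from TSC_AUX when RDTSCP is available) and the
node number into the bits above, so splitting the two needs only a mask and a
shift. A small standalone illustration of that packing:

#include <stdio.h>

/* Mirrors the "p & 0xfff" / "p >> 12" split in vgetcpu() above. */
static unsigned cpu_of(unsigned long p)  { return p & 0xfff; }
static unsigned node_of(unsigned long p) { return p >> 12; }

int main(void)
{
	unsigned long p = (3UL << 12) | 5;	/* node 3, cpu 5 */

	printf("cpu=%u node=%u\n", cpu_of(p), node_of(p));
	return 0;
}
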
 
Index: head-2008-07-15/arch/x86/mm/fault_64-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/mm/fault_64-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/mm/fault_64-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -40,8 +40,7 @@
 #define PF_RSVD	(1<<3)
 #define PF_INSTR	(1<<4)
 
-#ifdef CONFIG_KPROBES
-ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
+static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
 
 /* Hook to register for page fault notifications */
 int register_page_fault_notifier(struct notifier_block *nb)
@@ -49,11 +48,13 @@ int register_page_fault_notifier(struct 
 	vmalloc_sync_all();
 	return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
 }
+EXPORT_SYMBOL_GPL(register_page_fault_notifier);
 
 int unregister_page_fault_notifier(struct notifier_block *nb)
 {
 	return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
 }
+EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
 
 static inline int notify_page_fault(enum die_val val, const char *str,
 			struct pt_regs *regs, long err, int trap, int sig)
@@ -67,13 +68,6 @@ static inline int notify_page_fault(enum
 	};
 	return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
 }
-#else
-static inline int notify_page_fault(enum die_val val, const char *str,
-			struct pt_regs *regs, long err, int trap, int sig)
-{
-	return NOTIFY_DONE;
-}
-#endif
 
 void bust_spinlocks(int yes)
 {
@@ -102,7 +96,7 @@ void bust_spinlocks(int yes)
 static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
 				unsigned long error_code)
 { 
-	unsigned char *instr;
+	unsigned char __user *instr;
 	int scan_more = 1;
 	int prefetch = 0; 
 	unsigned char *max_instr;
@@ -111,7 +105,7 @@ static noinline int is_prefetch(struct p
 	if (error_code & PF_INSTR)
 		return 0;
 	
-	instr = (unsigned char *)convert_rip_to_linear(current, regs);
+	instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
 	max_instr = instr + 15;
 
 	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
@@ -122,7 +116,7 @@ static noinline int is_prefetch(struct p
 		unsigned char instr_hi;
 		unsigned char instr_lo;
 
-		if (__get_user(opcode, instr))
+		if (__get_user(opcode, (char __user *)instr))
 			break; 
 
 		instr_hi = opcode & 0xf0; 
@@ -160,7 +154,7 @@ static noinline int is_prefetch(struct p
 		case 0x00:
 			/* Prefetch instruction is 0x0F0D or 0x0F18 */
 			scan_more = 0;
-			if (__get_user(opcode, instr)) 
+			if (__get_user(opcode, (char __user *)instr))
 				break;
 			prefetch = (instr_lo == 0xF) &&
 				(opcode == 0x0D || opcode == 0x18);
@@ -176,7 +170,7 @@ static noinline int is_prefetch(struct p
 static int bad_address(void *p) 
 { 
 	unsigned long dummy;
-	return __get_user(dummy, (unsigned long *)p);
+	return __get_user(dummy, (unsigned long __user *)p);
 } 
 
 void dump_pagetable(unsigned long address)
@@ -248,7 +242,7 @@ static int is_errata93(struct pt_regs *r
 
 int unhandled_signal(struct task_struct *tsk, int sig)
 {
-	if (tsk->pid == 1)
+	if (is_init(tsk))
 		return 1;
 	if (tsk->ptrace & PT_PTRACED)
 		return 0;
@@ -300,7 +294,7 @@ static int vmalloc_fault(unsigned long a
 	if (pgd_none(*pgd))
 		set_pgd(pgd, *pgd_ref);
 	else
-		BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
+		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
 
 	/* Below here mismatches are bugs because these lower tables
 	   are shared */
@@ -309,7 +303,7 @@ static int vmalloc_fault(unsigned long a
 	pud_ref = pud_offset(pgd_ref, address);
 	if (pud_none(*pud_ref))
 		return -1;
-	if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
+	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
 		BUG();
 	pmd = pmd_offset(pud, address);
 	pmd_ref = pmd_offset(pud_ref, address);
@@ -531,7 +525,7 @@ good_area:
 		case PF_PROT:		/* read, present */
 			goto bad_area;
 		case 0:			/* read, not present */
-			if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+			if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
 				goto bad_area;
 	}
 
@@ -647,7 +641,7 @@ no_context:
  */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (current->pid == 1) { 
+	if (is_init(current)) {
 		yield();
 		goto again;
 	}
@@ -702,7 +696,7 @@ void vmalloc_sync_all(void)
 				if (pgd_none(*pgd))
 					set_pgd(pgd, *pgd_ref);
 				else
-					BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
+					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
 			}
 			spin_unlock(&pgd_lock);
 			set_bit(pgd_index(address), insync);
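
With register_page_fault_notifier()/unregister_page_fault_notifier() exported
(GPL-only) and the chain built unconditionally, a module can subscribe to
fault events the way kprobes does. A sketch, assuming the 2.6.19-era
asm/kdebug.h definitions of struct die_args and DIE_PAGE_FAULT; the my_*
names are invented:

#include <linux/notifier.h>
#include <asm/kdebug.h>

static int my_pf_event(struct notifier_block *nb, unsigned long val,
		       void *data)
{
	struct die_args *args = data;

	if (val == DIE_PAGE_FAULT && args->regs) {
		/* inspect args->regs / args->err here */
	}
	return NOTIFY_DONE;	/* never swallow the fault */
}

static struct notifier_block my_pf_nb = {
	.notifier_call = my_pf_event,
};

/* module init:	register_page_fault_notifier(&my_pf_nb);
 * module exit:	unregister_page_fault_notifier(&my_pf_nb); */
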
Index: head-2008-07-15/arch/x86/mm/init_64-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/mm/init_64-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/mm/init_64-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -61,8 +61,6 @@ EXPORT_SYMBOL(__kernel_page_user);
 
 extern unsigned long *contiguous_bitmap;
 
-static unsigned long dma_reserve __initdata;
-
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 extern unsigned long start_pfn;
 
@@ -416,7 +414,6 @@ __init void *early_ioremap(unsigned long
 
 	/* actually usually some more */
 	if (size >= LARGE_PAGE_SIZE) {
-		printk("SMBIOS area too long %lu\n", size);
 		return NULL;
 	}
 	set_pmd(temp_mappings[0].pmd,  __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
@@ -438,13 +435,15 @@ __init void early_iounmap(void *addr, un
 #endif
 
 static void __meminit
-phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
+phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
 {
-	int i, k;
+	int i = pmd_index(address);
 
-	for (i = 0; i < PTRS_PER_PMD; pmd++, i++) {
+	for (; i < PTRS_PER_PMD; i++) {
 		unsigned long pte_phys;
+		pmd_t *pmd = pmd_page + i;
 		pte_t *pte, *pte_save;
+		int k;
 
 		if (address >= end) {
 			if (!after_bootmem)
@@ -452,6 +451,12 @@ phys_pmd_init(pmd_t *pmd, unsigned long 
 					set_pmd(pmd, __pmd(0));
 			break;
 		}
+
+		if (__pmd_val(*pmd)) {
+			address += PMD_SIZE;
+			continue;
+		}
+
 		pte = alloc_static_page(&pte_phys);
 		pte_save = pte;
 		for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
@@ -474,40 +479,35 @@ phys_pmd_init(pmd_t *pmd, unsigned long 
 static void __meminit
 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
 {
-	pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
-
-	if (pmd_none(*pmd)) {
-		spin_lock(&init_mm.page_table_lock);
-		phys_pmd_init(pmd, address, end);
-		spin_unlock(&init_mm.page_table_lock);
-		__flush_tlb_all();
-	}
+	pmd_t *pmd = pmd_offset(pud,0);
+	spin_lock(&init_mm.page_table_lock);
+	phys_pmd_init(pmd, address, end);
+	spin_unlock(&init_mm.page_table_lock);
+	__flush_tlb_all();
 }
 
-static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
 { 
-	long i = pud_index(address);
-
-	pud = pud + i;
-
-	if (after_bootmem && pud_val(*pud)) {
-		phys_pmd_update(pud, address, end);
-		return;
-	}
+	int i = pud_index(addr);
 
-	for (; i < PTRS_PER_PUD; pud++, i++) {
-		unsigned long paddr, pmd_phys;
+	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
+		unsigned long pmd_phys;
+		pud_t *pud = pud_page + pud_index(addr);
 		pmd_t *pmd;
 
-		paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
-		if (paddr >= end)
+		if (addr >= end)
 			break;
 
+		if (__pud_val(*pud)) {
+			phys_pmd_update(pud, addr, end);
+			continue;
+		}
+
 		pmd = alloc_static_page(&pmd_phys);
 		early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
 		spin_lock(&init_mm.page_table_lock);
 		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
-		phys_pmd_init(pmd, paddr, end);
+		phys_pmd_init(pmd, addr, end);
 		spin_unlock(&init_mm.page_table_lock);
 	}
 	__flush_tlb();
@@ -771,69 +771,18 @@ void __cpuinit zap_low_mappings(int cpu)
 #endif
 }
 
-/* Compute zone sizes for the DMA and DMA32 zones in a node. */
-__init void
-size_zones(unsigned long *z, unsigned long *h,
-	   unsigned long start_pfn, unsigned long end_pfn)
-{
- 	int i;
- 	unsigned long w;
-
- 	for (i = 0; i < MAX_NR_ZONES; i++)
- 		z[i] = 0;
-
- 	if (start_pfn < MAX_DMA_PFN)
- 		z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
- 	if (start_pfn < MAX_DMA32_PFN) {
- 		unsigned long dma32_pfn = MAX_DMA32_PFN;
- 		if (dma32_pfn > end_pfn)
- 			dma32_pfn = end_pfn;
- 		z[ZONE_DMA32] = dma32_pfn - start_pfn;
- 	}
- 	z[ZONE_NORMAL] = end_pfn - start_pfn;
-
- 	/* Remove lower zones from higher ones. */
- 	w = 0;
- 	for (i = 0; i < MAX_NR_ZONES; i++) {
- 		if (z[i])
- 			z[i] -= w;
- 	        w += z[i];
-	}
-
-	/* Compute holes */
-	w = start_pfn;
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		unsigned long s = w;
-		w += z[i];
-		h[i] = e820_hole_size(s, w);
-	}
-
-	/* Add the space pace needed for mem_map to the holes too. */
-	for (i = 0; i < MAX_NR_ZONES; i++)
-		h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
-
-	/* The 16MB DMA zone has the kernel and other misc mappings.
- 	   Account them too */
-	if (h[ZONE_DMA]) {
-		h[ZONE_DMA] += dma_reserve;
-		if (h[ZONE_DMA] >= z[ZONE_DMA]) {
-			printk(KERN_WARNING
-				"Kernel too large and filling up ZONE_DMA?\n");
-			h[ZONE_DMA] = z[ZONE_DMA];
-		}
-	}
-}
-
 #ifndef CONFIG_NUMA
 void __init paging_init(void)
 {
-	unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
+	unsigned long max_zone_pfns[MAX_NR_ZONES];
+	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
+	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
+	max_zone_pfns[ZONE_NORMAL] = end_pfn;
 
 	memory_present(0, 0, end_pfn);
 	sparse_init();
-	size_zones(zones, holes, 0, end_pfn);
-	free_area_init_node(0, NODE_DATA(0), zones,
-			    __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
+	free_area_init_nodes(max_zone_pfns);
 
 	init_mm.context.pinned = 1;
 }
@@ -887,36 +836,23 @@ void online_page(struct page *page)
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
- * XXX: memory_add_physaddr_to_nid() is to find node id from physical address
- *	via probe interface of sysfs. If acpi notifies hot-add event, then it
- *	can tell node id by searching dsdt. But, probe interface doesn't have
- *	node id. So, return 0 as node id at this time.
- */
-#ifdef CONFIG_NUMA
-int memory_add_physaddr_to_nid(u64 start)
-{
-	return 0;
-}
-#endif
-
-/*
  * Memory is added always to NORMAL zone. This means you will never get
  * additional DMA/DMA32 memory.
  */
 int arch_add_memory(int nid, u64 start, u64 size)
 {
 	struct pglist_data *pgdat = NODE_DATA(nid);
-	struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
+	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	int ret;
 
+	init_memory_mapping(start, (start + size -1));
+
 	ret = __add_pages(zone, start_pfn, nr_pages);
 	if (ret)
 		goto error;
 
-	init_memory_mapping(start, (start + size -1));
-
 	return ret;
 error:
 	printk("%s: Problem encountered in __add_pages!\n", __func__);
@@ -930,7 +866,17 @@ int remove_memory(u64 start, u64 size)
 }
 EXPORT_SYMBOL_GPL(remove_memory);
 
-#else /* CONFIG_MEMORY_HOTPLUG */
+#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
+int memory_add_physaddr_to_nid(u64 start)
+{
+	return 0;
+}
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+#endif
+
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
 /*
  * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
  * just online the pages.
@@ -956,7 +902,7 @@ int __add_pages(struct zone *z, unsigned
 	}
 	return err;
 }
-#endif /* CONFIG_MEMORY_HOTPLUG */
+#endif
 
 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
 			 kcore_vsyscall;
@@ -973,12 +919,6 @@ void __init mem_init(void)
 
 	pci_iommu_alloc();
 
-	/* How many end-of-memory variables you have, grandma! */
-	max_low_pfn = end_pfn;
-	max_pfn = end_pfn;
-	num_physpages = end_pfn;
-	high_memory = (void *) __va(end_pfn * PAGE_SIZE);
-
 	/* clear the zero-page */
 	memset(empty_zero_page, 0, PAGE_SIZE);
 
@@ -996,7 +936,8 @@ void __init mem_init(void)
 		init_page_count(pfn_to_page(pfn));
 		totalram_pages++;
 	}
-	reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);
+	reservedpages = end_pfn - totalram_pages -
+					absent_pages_in_range(0, end_pfn);
 
 	after_bootmem = 1;
 
@@ -1103,15 +1044,34 @@ void free_initrd_mem(unsigned long start
 
 void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 
 { 
-	/* Should check here against the e820 map to avoid double free */ 
 #ifdef CONFIG_NUMA
 	int nid = phys_to_nid(phys);
+#endif
+	unsigned long pfn = phys >> PAGE_SHIFT;
+	if (pfn >= end_pfn) {
+		/* This can happen with kdump kernels when accessing firmware
+		   tables. */
+		if (pfn < end_pfn_map)
+			return;
+		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
+				phys, len);
+		return;
+	}
+
+	/* Should check here against the e820 map to avoid double free */
+#ifdef CONFIG_NUMA
   	reserve_bootmem_node(NODE_DATA(nid), phys, len);
 #else       		
 	reserve_bootmem(phys, len);    
 #endif
-	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
+#ifndef CONFIG_XEN
+	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
+		static unsigned long dma_reserve __initdata;
+
 		dma_reserve += len / PAGE_SIZE;
+		set_dma_reserve(dma_reserve);
+	}
+#endif
 }
 
 int kern_addr_valid(unsigned long addr) 
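
The deleted size_zones()/e820_hole_size() logic is superseded by the
arch-neutral contract visible in the paging_init() hunk above: the
architecture registers its usable PFN ranges and hands the core only per-zone
ceilings; zone sizes, holes and reserved-page accounting
(absent_pages_in_range) then fall out generically. In outline, with
zone_setup_outline as an invented wrapper name:

static void __init zone_setup_outline(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	e820_register_active_regions(0, 0, end_pfn);	/* declare usable PFNs */

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA]    = MAX_DMA_PFN;	/* below 16MB */
	max_zone_pfns[ZONE_DMA32]  = MAX_DMA32_PFN;	/* below 4GB */
	max_zone_pfns[ZONE_NORMAL] = end_pfn;
	free_area_init_nodes(max_zone_pfns);	/* core derives sizes and holes */
}
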
Index: head-2008-07-15/arch/x86/mm/pageattr_64-xen.c
===================================================================
--- head-2008-07-15.orig/arch/x86/mm/pageattr_64-xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/arch/x86/mm/pageattr_64-xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -371,8 +371,8 @@ static void revert_page(unsigned long ad
 	BUG_ON(pud_none(*pud));
 	pmd = pmd_offset(pud, address);
 	BUG_ON(__pmd_val(*pmd) & _PAGE_PSE);
-	pgprot_val(ref_prot) |= _PAGE_PSE;
 	large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot);
+	large_pte = pte_mkhuge(large_pte);
 	set_pte((pte_t *)pmd, large_pte);
 }      
 
@@ -382,32 +382,28 @@ __change_page_attr(unsigned long address
 { 
 	pte_t *kpte; 
 	struct page *kpte_page;
-	unsigned kpte_flags;
 	pgprot_t ref_prot2;
 	kpte = lookup_address(address);
 	if (!kpte) return 0;
 	kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
-	kpte_flags = pte_val(*kpte); 
 	if (pgprot_val(prot) != pgprot_val(ref_prot)) { 
-		if ((kpte_flags & _PAGE_PSE) == 0) { 
+		if (!pte_huge(*kpte)) {
 			set_pte(kpte, pfn_pte(pfn, prot));
 		} else {
  			/*
 			 * split_large_page will take the reference for this
 			 * change_page_attr on the split page.
  			 */
-
 			struct page *split;
-			ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE));
-
+			ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
 			split = split_large_page(address, prot, ref_prot2);
 			if (!split)
 				return -ENOMEM;
-			set_pte(kpte,mk_pte(split, ref_prot2));
+			set_pte(kpte, mk_pte(split, ref_prot2));
 			kpte_page = split;
-		}	
+		}
 		page_private(kpte_page)++;
-	} else if ((kpte_flags & _PAGE_PSE) == 0) { 
+	} else if (!pte_huge(*kpte)) {
 		set_pte(kpte, pfn_pte(pfn, ref_prot));
 		BUG_ON(page_private(kpte_page) == 0);
 		page_private(kpte_page)--;
@@ -464,10 +460,12 @@ int change_page_attr_addr(unsigned long 
 		 * lowmem */
 		if (__pa(address) < KERNEL_TEXT_SIZE) {
 			unsigned long addr2;
-			pgprot_t prot2 = prot;
+			pgprot_t prot2;
 			addr2 = __START_KERNEL_map + __pa(address);
- 			pgprot_val(prot2) &= ~_PAGE_NX;
-			err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC);
+			/* Make sure the kernel mappings stay executable */
+			prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
+			err = __change_page_attr(addr2, pfn, prot2,
+						 PAGE_KERNEL_EXEC);
 		} 
 	} 	
 	up_write(&init_mm.mmap_sem); 
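
The pageattr rewrite above replaces raw _PAGE_PSE arithmetic with the pte
accessor helpers. On native x86-64 they reduce roughly to the following
(simplified sketch with *_sketch names; the Xen build additionally hides
machine/physical translation inside pte_val()/__pte()):

static inline int pte_huge_sketch(pte_t pte)
{
	return pte_val(pte) & _PAGE_PSE;
}

static inline pte_t pte_mkhuge_sketch(pte_t pte)
{
	return __pte(pte_val(pte) | _PAGE_PSE);
}

static inline pte_t pte_clrhuge_sketch(pte_t pte)
{
	return __pte(pte_val(pte) & ~_PAGE_PSE);
}
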
Index: head-2008-07-15/drivers/char/tpm/tpm_xen.c
===================================================================
--- head-2008-07-15.orig/drivers/char/tpm/tpm_xen.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/char/tpm/tpm_xen.c	2008-07-15 14:24:17.000000000 +0200
@@ -85,8 +85,7 @@ static struct tpm_private *my_priv;
 
 /* local function prototypes */
 static irqreturn_t tpmif_int(int irq,
-                             void *tpm_priv,
-                             struct pt_regs *ptregs);
+                             void *tpm_priv);
 static void tpmif_rx_action(unsigned long unused);
 static int tpmif_connect(struct xenbus_device *dev,
                          struct tpm_private *tp,
@@ -559,7 +558,7 @@ static void tpmif_rx_action(unsigned lon
 }
 
 
-static irqreturn_t tpmif_int(int irq, void *tpm_priv, struct pt_regs *ptregs)
+static irqreturn_t tpmif_int(int irq, void *tpm_priv)
 {
 	struct tpm_private *tp = tpm_priv;
 	unsigned long flags;
Index: head-2008-07-15/drivers/pci/Kconfig
===================================================================
--- head-2008-07-15.orig/drivers/pci/Kconfig	2008-01-24 23:58:37.000000000 +0100
+++ head-2008-07-15/drivers/pci/Kconfig	2008-07-15 14:24:17.000000000 +0200
@@ -45,7 +45,7 @@ config PCI_DEBUG
 config HT_IRQ
 	bool "Interrupts on hypertransport devices"
 	default y
-	depends on PCI && X86_LOCAL_APIC && X86_IO_APIC
+	depends on PCI && X86_LOCAL_APIC && X86_IO_APIC && !XEN
 	help
 	   This allows native hypertransport devices to use interrupts.
 
Index: head-2008-07-15/drivers/xen/Kconfig
===================================================================
--- head-2008-07-15.orig/drivers/xen/Kconfig	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/Kconfig	2008-07-15 14:24:17.000000000 +0200
@@ -265,6 +265,9 @@ endmenu
 config HAVE_IRQ_IGNORE_UNHANDLED
 	def_bool y
 
+config GENERIC_HARDIRQS_NO__DO_IRQ
+	def_bool y
+
 config NO_IDLE_HZ
 	def_bool y
 
Index: head-2008-07-15/drivers/xen/balloon/balloon.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/balloon/balloon.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/balloon/balloon.c	2008-07-15 14:24:17.000000000 +0200
@@ -84,7 +84,7 @@ static unsigned long frame_list[PAGE_SIZ
 /* VM /proc information for memory */
 extern unsigned long totalram_pages;
 
-#ifndef MODULE
+#if !defined(MODULE) && defined(CONFIG_HIGHMEM)
 extern unsigned long totalhigh_pages;
 #define inc_totalhigh_pages() (totalhigh_pages++)
 #define dec_totalhigh_pages() (totalhigh_pages--)
Index: head-2008-07-15/drivers/xen/blkback/blkback.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/blkback/blkback.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/blkback/blkback.c	2008-07-15 14:24:17.000000000 +0200
@@ -288,7 +288,7 @@ static void blkif_notify_work(blkif_t *b
 	wake_up(&blkif->wq);
 }
 
-irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
+irqreturn_t blkif_be_int(int irq, void *dev_id)
 {
 	blkif_notify_work(dev_id);
 	return IRQ_HANDLED;
Index: head-2008-07-15/drivers/xen/blkback/common.h
===================================================================
--- head-2008-07-15.orig/drivers/xen/blkback/common.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/blkback/common.h	2008-07-15 14:24:17.000000000 +0200
@@ -130,7 +130,7 @@ void blkif_interface_init(void);
 
 void blkif_xenbus_init(void);
 
-irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
+irqreturn_t blkif_be_int(int irq, void *dev_id);
 int blkif_schedule(void *arg);
 
 int blkback_barrier(struct xenbus_transaction xbt,
Index: head-2008-07-15/drivers/xen/blkfront/blkfront.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/blkfront/blkfront.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/blkfront/blkfront.c	2008-07-15 14:24:17.000000000 +0200
@@ -69,7 +69,7 @@ static int setup_blkring(struct xenbus_d
 
 static void kick_pending_request_queues(struct blkfront_info *);
 
-static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs);
+static irqreturn_t blkif_int(int irq, void *dev_id);
 static void blkif_restart_queue(void *arg);
 static void blkif_recover(struct blkfront_info *);
 static void blkif_completion(struct blk_shadow *);
@@ -698,7 +698,7 @@ void do_blkif_request(request_queue_t *r
 }
 
 
-static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
+static irqreturn_t blkif_int(int irq, void *dev_id)
 {
 	struct request *req;
 	blkif_response_t *bret;
Index: head-2008-07-15/drivers/xen/blktap/blktap.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/blktap/blktap.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/blktap/blktap.c	2008-07-15 14:24:17.000000000 +0200
@@ -1175,7 +1175,7 @@ static void blkif_notify_work(blkif_t *b
 	wake_up(&blkif->wq);
 }
 
-irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
+irqreturn_t tap_blkif_be_int(int irq, void *dev_id)
 {
 	blkif_notify_work(dev_id);
 	return IRQ_HANDLED;
Index: head-2008-07-15/drivers/xen/blktap/common.h
===================================================================
--- head-2008-07-15.orig/drivers/xen/blktap/common.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/blktap/common.h	2008-07-15 14:24:17.000000000 +0200
@@ -112,7 +112,7 @@ void tap_blkif_interface_init(void);
 
 void tap_blkif_xenbus_init(void);
 
-irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
+irqreturn_t tap_blkif_be_int(int irq, void *dev_id);
 int tap_blkif_schedule(void *arg);
 
 int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif);
Index: head-2008-07-15/drivers/xen/console/console.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/console/console.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/console/console.c	2008-07-16 12:41:11.000000000 +0200
@@ -345,7 +345,7 @@ static struct tty_struct *xencons_tty;
 static int xencons_priv_irq;
 static char x_char;
 
-void xencons_rx(char *buf, unsigned len, struct pt_regs *regs)
+void xencons_rx(char *buf, unsigned len)
 {
 	int           i;
 	unsigned long flags;
@@ -370,8 +370,7 @@ void xencons_rx(char *buf, unsigned len,
 				if (time_before(jiffies, sysrq_timeout)) {
 					spin_unlock_irqrestore(
 						&xencons_lock, flags);
-					handle_sysrq(
-						buf[i], regs, xencons_tty);
+					handle_sysrq(buf[i], xencons_tty);
 					spin_lock_irqsave(
 						&xencons_lock, flags);
 					continue;
@@ -436,14 +435,13 @@ void xencons_tx(void)
 }
 
 /* Privileged receive callback and transmit kicker. */
-static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id,
-					  struct pt_regs *regs)
+static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id)
 {
 	static char rbuf[16];
 	int         l;
 
 	while ((l = HYPERVISOR_console_io(CONSOLEIO_read, 16, rbuf)) > 0)
-		xencons_rx(rbuf, l, regs);
+		xencons_rx(rbuf, l);
 
 	xencons_tx();
 
@@ -631,7 +629,7 @@ static void xencons_close(struct tty_str
 	spin_unlock_irqrestore(&xencons_lock, flags);
 }
 
-static struct tty_operations xencons_ops = {
+static const struct tty_operations xencons_ops = {
 	.open = xencons_open,
 	.close = xencons_close,
 	.write = xencons_write,
Index: head-2008-07-15/drivers/xen/console/xencons_ring.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/console/xencons_ring.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/console/xencons_ring.c	2008-07-15 14:24:17.000000000 +0200
@@ -83,7 +83,7 @@ int xencons_ring_send(const char *data, 
 	return sent;
 }
 
-static irqreturn_t handle_input(int irq, void *unused, struct pt_regs *regs)
+static irqreturn_t handle_input(int irq, void *unused)
 {
 	struct xencons_interface *intf = xencons_interface();
 	XENCONS_RING_IDX cons, prod;
@@ -94,7 +94,7 @@ static irqreturn_t handle_input(int irq,
 	BUG_ON((prod - cons) > sizeof(intf->in));
 
 	while (cons != prod) {
-		xencons_rx(intf->in+MASK_XENCONS_IDX(cons,intf->in), 1, regs);
+		xencons_rx(intf->in+MASK_XENCONS_IDX(cons,intf->in), 1);
 		cons++;
 	}
 
Index: head-2008-07-15/drivers/xen/core/evtchn.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/core/evtchn.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/core/evtchn.c	2008-07-15 14:24:17.000000000 +0200
@@ -507,7 +507,7 @@ static void unbind_from_irq(unsigned int
 
 int bind_caller_port_to_irqhandler(
 	unsigned int caller_port,
-	irqreturn_t (*handler)(int, void *, struct pt_regs *),
+	irq_handler_t handler,
 	unsigned long irqflags,
 	const char *devname,
 	void *dev_id)
@@ -530,7 +530,7 @@ EXPORT_SYMBOL_GPL(bind_caller_port_to_ir
 
 int bind_listening_port_to_irqhandler(
 	unsigned int remote_domain,
-	irqreturn_t (*handler)(int, void *, struct pt_regs *),
+	irq_handler_t handler,
 	unsigned long irqflags,
 	const char *devname,
 	void *dev_id)
@@ -554,7 +554,7 @@ EXPORT_SYMBOL_GPL(bind_listening_port_to
 int bind_interdomain_evtchn_to_irqhandler(
 	unsigned int remote_domain,
 	unsigned int remote_port,
-	irqreturn_t (*handler)(int, void *, struct pt_regs *),
+	irq_handler_t handler,
 	unsigned long irqflags,
 	const char *devname,
 	void *dev_id)
@@ -578,7 +578,7 @@ EXPORT_SYMBOL_GPL(bind_interdomain_evtch
 int bind_virq_to_irqhandler(
 	unsigned int virq,
 	unsigned int cpu,
-	irqreturn_t (*handler)(int, void *, struct pt_regs *),
+	irq_handler_t handler,
 	unsigned long irqflags,
 	const char *devname,
 	void *dev_id)
@@ -602,7 +602,7 @@ EXPORT_SYMBOL_GPL(bind_virq_to_irqhandle
 int bind_ipi_to_irqhandler(
 	unsigned int ipi,
 	unsigned int cpu,
-	irqreturn_t (*handler)(int, void *, struct pt_regs *),
+	irq_handler_t handler,
 	unsigned long irqflags,
 	const char *devname,
 	void *dev_id)
@@ -687,15 +687,7 @@ static unsigned int startup_dynirq(unsig
 	return 0;
 }
 
-static void shutdown_dynirq(unsigned int irq)
-{
-	int evtchn = evtchn_from_irq(irq);
-
-	if (VALID_EVTCHN(evtchn))
-		mask_evtchn(evtchn);
-}
-
-static void enable_dynirq(unsigned int irq)
+static void unmask_dynirq(unsigned int irq)
 {
 	int evtchn = evtchn_from_irq(irq);
 
@@ -703,7 +695,7 @@ static void enable_dynirq(unsigned int i
 		unmask_evtchn(evtchn);
 }
 
-static void disable_dynirq(unsigned int irq)
+static void mask_dynirq(unsigned int irq)
 {
 	int evtchn = evtchn_from_irq(irq);
 
@@ -731,12 +723,12 @@ static void end_dynirq(unsigned int irq)
 		unmask_evtchn(evtchn);
 }
 
-static struct hw_interrupt_type dynirq_type = {
-	.typename = "Dynamic-irq",
+static struct irq_chip dynirq_chip = {
+	.name     = "Dynamic-irq",
 	.startup  = startup_dynirq,
-	.shutdown = shutdown_dynirq,
-	.enable   = enable_dynirq,
-	.disable  = disable_dynirq,
+	.mask     = mask_dynirq,
+	.unmask   = unmask_dynirq,
+	.mask_ack = ack_dynirq,
 	.ack      = ack_dynirq,
 	.end      = end_dynirq,
 #ifdef CONFIG_SMP
@@ -820,12 +812,12 @@ static void shutdown_pirq(unsigned int i
 	irq_info[irq] = IRQ_UNBOUND;
 }
 
-static void enable_pirq(unsigned int irq)
+static void unmask_pirq(unsigned int irq)
 {
 	startup_pirq(irq);
 }
 
-static void disable_pirq(unsigned int irq)
+static void mask_pirq(unsigned int irq)
 {
 }
 
@@ -854,12 +846,14 @@ static void end_pirq(unsigned int irq)
 	}
 }
 
-static struct hw_interrupt_type pirq_type = {
+static struct irq_chip pirq_chip = {
+	.name     = "Phys-irq",
 	.typename = "Phys-irq",
 	.startup  = startup_pirq,
 	.shutdown = shutdown_pirq,
-	.enable   = enable_pirq,
-	.disable  = disable_pirq,
+	.mask     = mask_pirq,
+	.unmask   = unmask_pirq,
+	.mask_ack = ack_pirq,
 	.ack      = ack_pirq,
 	.end      = end_pirq,
 #ifdef CONFIG_SMP
@@ -1043,7 +1037,8 @@ void __init xen_init_IRQ(void)
 		irq_desc[dynirq_to_irq(i)].status = IRQ_DISABLED;
 		irq_desc[dynirq_to_irq(i)].action = NULL;
 		irq_desc[dynirq_to_irq(i)].depth = 1;
-		irq_desc[dynirq_to_irq(i)].chip = &dynirq_type;
+		set_irq_chip_and_handler_name(dynirq_to_irq(i), &dynirq_chip,
+		                              handle_level_irq, "level");
 	}
 
 	/* Phys IRQ space is statically bound (1:1 mapping). Nail refcnts. */
@@ -1059,6 +1054,7 @@ void __init xen_init_IRQ(void)
 		irq_desc[pirq_to_irq(i)].status = IRQ_DISABLED;
 		irq_desc[pirq_to_irq(i)].action = NULL;
 		irq_desc[pirq_to_irq(i)].depth = 1;
-		irq_desc[pirq_to_irq(i)].chip = &pirq_type;
+		set_irq_chip_and_handler_name(pirq_to_irq(i), &pirq_chip,
+		                              handle_level_irq, "level");
 	}
 }
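
The event-channel code above moves from hw_interrupt_type
(startup/shutdown/enable/disable) to the 2.6.19 irq_chip model
(mask/unmask/mask_ack) and lets the generic handle_level_irq drive the flow.
The minimal shape, with invented my_* callbacks:

#include <linux/irq.h>

static void my_mask(unsigned int irq)     { /* mask the event channel */ }
static void my_unmask(unsigned int irq)   { /* unmask the event channel */ }
static void my_ack(unsigned int irq)      { /* clear the pending bit */ }
static void my_mask_ack(unsigned int irq) { /* mask, then clear pending */ }

static struct irq_chip my_chip = {
	.name     = "my-chip",
	.mask     = my_mask,
	.unmask   = my_unmask,
	.mask_ack = my_mask_ack,	/* level flow acks with source masked */
	.ack      = my_ack,
};

/* wiring, as done for dynirqs and pirqs above:
 *	set_irq_chip_and_handler_name(irq, &my_chip, handle_level_irq, "level");
 */
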
Index: head-2008-07-15/drivers/xen/core/reboot.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/core/reboot.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/core/reboot.c	2008-07-15 14:24:17.000000000 +0200
@@ -13,6 +13,7 @@
 
 #ifdef HAVE_XEN_PLATFORM_COMPAT_H
 #include <xen/platform-compat.h>
+#undef handle_sysrq
 #endif
 
 MODULE_LICENSE("Dual BSD/GPL");
@@ -203,7 +204,7 @@ static void sysrq_handler(struct xenbus_
 
 #ifdef CONFIG_MAGIC_SYSRQ
 	if (sysrq_key != '\0')
-		handle_sysrq(sysrq_key, NULL, NULL);
+		handle_sysrq(sysrq_key, NULL);
 #endif
 }
 
Index: head-2008-07-15/drivers/xen/core/smpboot.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/core/smpboot.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/core/smpboot.c	2008-07-15 14:24:17.000000000 +0200
@@ -25,8 +25,8 @@
 #include <xen/cpu_hotplug.h>
 #include <xen/xenbus.h>
 
-extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *);
-extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *);
+extern irqreturn_t smp_reschedule_interrupt(int, void *);
+extern irqreturn_t smp_call_function_interrupt(int, void *);
 
 extern int local_setup_timer(unsigned int cpu);
 extern void local_teardown_timer(unsigned int cpu);
@@ -66,8 +66,6 @@ EXPORT_SYMBOL(cpu_core_map);
 #if defined(__i386__)
 u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = 0xff };
 EXPORT_SYMBOL(x86_cpu_to_apicid);
-#elif !defined(CONFIG_X86_IO_APIC)
-unsigned int maxcpus = NR_CPUS;
 #endif
 
 void __init prefill_possible_map(void)
Index: head-2008-07-15/drivers/xen/fbfront/xenfb.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/fbfront/xenfb.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/fbfront/xenfb.c	2008-07-15 14:24:17.000000000 +0200
@@ -523,8 +523,7 @@ static struct fb_ops xenfb_fb_ops = {
 	.fb_set_par     = xenfb_set_par,
 };
 
-static irqreturn_t xenfb_event_handler(int rq, void *dev_id,
-				       struct pt_regs *regs)
+static irqreturn_t xenfb_event_handler(int rq, void *dev_id)
 {
 	/*
 	 * No in events recognized, simply ignore them all.
Index: head-2008-07-15/drivers/xen/fbfront/xenkbd.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/fbfront/xenkbd.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/fbfront/xenkbd.c	2008-07-15 14:24:17.000000000 +0200
@@ -46,7 +46,7 @@ static void xenkbd_disconnect_backend(st
  * to do that.
  */
 
-static irqreturn_t input_handler(int rq, void *dev_id, struct pt_regs *regs)
+static irqreturn_t input_handler(int rq, void *dev_id)
 {
 	struct xenkbd_info *info = dev_id;
 	struct xenkbd_page *page = info->page;
Index: head-2008-07-15/drivers/xen/gntdev/gntdev.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/gntdev/gntdev.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/gntdev/gntdev.c	2008-07-15 14:24:17.000000000 +0200
@@ -755,9 +755,6 @@ static pte_t gntdev_clear_pte(struct vm_
 		BUG();
 	}
 
-	/* Copy the existing value of the PTE for returning. */
-	copy = *ptep;
-
 	/* Calculate the grant relating to this PTE. */
 	slot_index = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
 
@@ -772,6 +769,10 @@ static pte_t gntdev_clear_pte(struct vm_
 		    GNTDEV_INVALID_HANDLE && 
 		    !xen_feature(XENFEAT_auto_translated_physmap)) {
 			/* NOT USING SHADOW PAGE TABLES. */
+
+			/* Copy the existing value of the PTE for returning. */
+			copy = *ptep;
+
 			gnttab_set_unmap_op(&op, virt_to_machine(ptep), 
 					    GNTMAP_contains_pte,
 					    private_data->grants[slot_index]
@@ -784,7 +785,7 @@ static pte_t gntdev_clear_pte(struct vm_
 				       op.status);
 		} else {
 			/* USING SHADOW PAGE TABLES. */
-			pte_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
+			copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
 		}
 
 		/* Finally, we unmap the grant from kernel space. */
@@ -812,7 +813,7 @@ static pte_t gntdev_clear_pte(struct vm_
 				    >> PAGE_SHIFT, INVALID_P2M_ENTRY);
 
 	} else {
-		pte_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
+		copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
 	}
 
 	return copy;
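
The gntdev hunks above are a behavioural fix rather than churn: pte_clear_full() returns void, so the old code returned a PTE value snapshotted before the clear, while ptep_get_and_clear_full() tears down the entry and hands back its old contents in one step; the unconditional early snapshot survives only in the unmap-op branch that still needs it. Reduced to its essence (a fragment, not a complete function):

	/* Before: snapshot, then clear; the two can disagree. */
	copy = *ptep;
	pte_clear_full(vma->vm_mm, addr, ptep, is_fullmm);

	/* After: the caller receives exactly the PTE that was cleared. */
	copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
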
Index: head-2008-07-15/drivers/xen/netback/accel.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/netback/accel.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/netback/accel.c	2008-07-15 14:24:17.000000000 +0200
@@ -65,7 +65,7 @@ static int match_accelerator(struct xenb
 	
 	if (IS_ERR(eth_name)) {
 		/* Probably means not present */
-		DPRINTK("%s: no match due to xenbus_read accel error %d\n", 
+		DPRINTK("%s: no match due to xenbus_read accel error %ld\n",
 			__FUNCTION__, PTR_ERR(eth_name));
 		return 0;
 	} else {
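
PTR_ERR() returns long, so printing it with %d is a format/type mismatch that gcc's printf checking flags and that misprints on 64-bit builds; %ld is the matching conversion. The same class of fix appears in tpmback below, where a size_t argument gets %zu. A fragment with illustrative values:

	void *eth_name = ERR_PTR(-ENOENT);
	size_t size = 128;

	printk(KERN_DEBUG "err %ld, size %zu\n", PTR_ERR(eth_name), size);
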
Index: head-2008-07-15/drivers/xen/netback/common.h
===================================================================
--- head-2008-07-15.orig/drivers/xen/netback/common.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/netback/common.h	2008-07-15 14:24:17.000000000 +0200
@@ -200,7 +200,7 @@ void netif_deschedule_work(netif_t *neti
 
 int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev);
 struct net_device_stats *netif_be_get_stats(struct net_device *dev);
-irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs);
+irqreturn_t netif_be_int(int irq, void *dev_id);
 
 static inline int netbk_can_queue(struct net_device *dev)
 {
Index: head-2008-07-15/drivers/xen/netback/loopback.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/netback/loopback.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/netback/loopback.c	2008-07-15 14:24:17.000000000 +0200
@@ -151,7 +151,7 @@ static int loopback_start_xmit(struct sk
 	np->stats.rx_bytes += skb->len;
 	np->stats.rx_packets++;
 
-	if (skb->ip_summed == CHECKSUM_HW) {
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		/* Defer checksum calculation. */
 		skb->proto_csum_blank = 1;
 		/* Must be a local packet: assert its integrity. */
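
All of the CHECKSUM_HW conversions in this patch stem from the 2.6.19 split of that constant into CHECKSUM_PARTIAL (transmit: the checksum at skb->h.raw + skb->csum still has to be filled in) and CHECKSUM_COMPLETE (receive: the device already summed the packet). Every site touched here tests the transmit-side meaning, hence CHECKSUM_PARTIAL. The idiom, as a fragment:

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		/* Transmit: hardware (or, for Xen, the peer domain) must
		 * complete the checksum at skb->h.raw + skb->csum. */
	}
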
Index: head-2008-07-15/drivers/xen/netback/netback.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/netback/netback.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/netback/netback.c	2008-07-15 14:24:17.000000000 +0200
@@ -677,7 +677,7 @@ static void net_rx_action(unsigned long 
 		id = meta[npo.meta_cons].id;
 		flags = nr_frags ? NETRXF_more_data : 0;
 
-		if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
+		if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
 			flags |= NETRXF_csum_blank | NETRXF_data_validated;
 		else if (skb->proto_data_valid) /* remote but checksummed? */
 			flags |= NETRXF_data_validated;
@@ -1441,7 +1441,7 @@ static void netif_page_release(struct pa
 	netif_idx_release(netif_page_index(page));
 }
 
-irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs)
+irqreturn_t netif_be_int(int irq, void *dev_id)
 {
 	netif_t *netif = dev_id;
 
@@ -1508,7 +1508,7 @@ static netif_rx_response_t *make_rx_resp
 }
 
 #ifdef NETBE_DEBUG_INTERRUPT
-static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
+static irqreturn_t netif_be_dbg(int irq, void *dev_id)
 {
 	struct list_head *ent;
 	netif_t *netif;
Index: head-2008-07-15/drivers/xen/netfront/netfront.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/netfront/netfront.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/netfront/netfront.c	2008-07-15 14:24:17.000000000 +0200
@@ -136,7 +136,7 @@ static inline int netif_needs_gso(struct
 {
         return skb_is_gso(skb) &&
                (!skb_gso_ok(skb, dev->features) ||
-                unlikely(skb->ip_summed != CHECKSUM_HW));
+                unlikely(skb->ip_summed != CHECKSUM_PARTIAL));
 }
 #else
 #define HAVE_GSO			0
@@ -222,7 +222,7 @@ static void network_tx_buf_gc(struct net
 static void network_alloc_rx_buffers(struct net_device *);
 static void send_fake_arp(struct net_device *);
 
-static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs);
+static irqreturn_t netif_int(int irq, void *dev_id);
 
 #ifdef CONFIG_SYSFS
 static int xennet_sysfs_addif(struct net_device *netdev);
@@ -992,7 +992,7 @@ static int network_start_xmit(struct sk_
 	tx->flags = 0;
 	extra = NULL;
 
-	if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
 		tx->flags |= NETTXF_csum_blank | NETTXF_data_validated;
 #ifdef CONFIG_XEN
 	if (skb->proto_data_valid) /* remote but checksummed? */
@@ -1049,7 +1049,7 @@ static int network_start_xmit(struct sk_
 	return 0;
 }
 
-static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
+static irqreturn_t netif_int(int irq, void *dev_id)
 {
 	struct net_device *dev = dev_id;
 	struct netfront_info *np = netdev_priv(dev);
Index: head-2008-07-15/drivers/xen/pciback/pciback.h
===================================================================
--- head-2008-07-15.orig/drivers/xen/pciback/pciback.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/pciback/pciback.h	2008-07-15 14:24:17.000000000 +0200
@@ -87,7 +87,7 @@ int pciback_publish_pci_roots(struct pci
 void pciback_release_devices(struct pciback_device *pdev);
 
 /* Handles events from front-end */
-irqreturn_t pciback_handle_event(int irq, void *dev_id, struct pt_regs *regs);
+irqreturn_t pciback_handle_event(int irq, void *dev_id);
 void pciback_do_op(void *data);
 
 int pciback_xenbus_register(void);
Index: head-2008-07-15/drivers/xen/pciback/pciback_ops.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/pciback/pciback_ops.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/pciback/pciback_ops.c	2008-07-15 14:24:17.000000000 +0200
@@ -85,7 +85,7 @@ void pciback_do_op(void *data)
 	test_and_schedule_op(pdev);
 }
 
-irqreturn_t pciback_handle_event(int irq, void *dev_id, struct pt_regs *regs)
+irqreturn_t pciback_handle_event(int irq, void *dev_id)
 {
 	struct pciback_device *pdev = dev_id;
 
Index: head-2008-07-15/drivers/xen/pcifront/pci_op.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/pcifront/pci_op.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/pcifront/pci_op.c	2008-07-15 14:24:17.000000000 +0200
@@ -392,10 +392,16 @@ int __devinit pcifront_rescan_root(struc
 
 		d = pci_scan_single_device(b, devfn);
 		if (d) {
+			int err;
+
 			dev_info(&pdev->xdev->dev, "New device on "
 				 "%04x:%02x:%02x.%02x found.\n", domain, bus,
 				 PCI_SLOT(devfn), PCI_FUNC(devfn));
-			pci_bus_add_device(d);
+			err = pci_bus_add_device(d);
+			if (err)
+				dev_err(&pdev->xdev->dev,
+				        "error %d adding device, continuing.\n",
+					err);
 		}
 	}
 
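pci_bus_add_device() registers the scanned device with the driver core and can fail; newer kernels warn when its result is ignored, so the hunk above checks the return value, logging the error and continuing the rescan rather than silently dropping it. A minimal illustration of the warn-on-ignored-result mechanism behind such fixes (hypothetical function):

	/* __must_check expands to __attribute__((warn_unused_result)). */
	static int __must_check try_register(void)
	{
		return -EIO;
	}

	static void caller(void)
	{
		int err;

		try_register();		/* warning: ignoring return value */
		err = try_register();	/* checked result: no warning */
		if (err)
			return;		/* handle the failure */
	}
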
Index: head-2008-07-15/drivers/xen/privcmd/compat_privcmd.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/privcmd/compat_privcmd.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/privcmd/compat_privcmd.c	2008-07-15 14:24:17.000000000 +0200
@@ -18,7 +18,6 @@
  * Authors: Jimi Xenidis <jimix@watson.ibm.com>
  */
 
-#include <linux/config.h>
 #include <linux/compat.h>
 #include <linux/ioctl.h>
 #include <linux/syscalls.h>
Index: head-2008-07-15/drivers/xen/privcmd/privcmd.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/privcmd/privcmd.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/privcmd/privcmd.c	2008-07-15 14:24:17.000000000 +0200
@@ -236,7 +236,7 @@ static int privcmd_mmap(struct file * fi
 #endif
 
 	/* DONTCOPY is essential for Xen as copy_page_range is broken. */
-	vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY;
+	vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTCOPY;
 	vma->vm_ops = &privcmd_vm_ops;
 	vma->vm_private_data = NULL;
 
Index: head-2008-07-15/drivers/xen/sfc_netback/accel_xenbus.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/sfc_netback/accel_xenbus.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/sfc_netback/accel_xenbus.c	2008-07-15 14:24:17.000000000 +0200
@@ -68,8 +68,7 @@ static void unlink_bend(struct netback_a
 
 
 /* Demultiplex a message IRQ from the frontend driver.  */
-static irqreturn_t msgirq_from_frontend(int irq, void *context, 
-				     struct pt_regs *unused)
+static irqreturn_t msgirq_from_frontend(int irq, void *context)
 {
 	struct xenbus_device *dev = context;
 	struct netback_accel *bend = NETBACK_ACCEL_FROM_XENBUS_DEVICE(dev);
@@ -84,8 +83,7 @@ static irqreturn_t msgirq_from_frontend(
  * functionally, but we need it to pass to the bind function, and may
  * get called spuriously
  */
-static irqreturn_t netirq_from_frontend(int irq, void *context, 
-					struct pt_regs *unused)
+static irqreturn_t netirq_from_frontend(int irq, void *context)
 {
 	VPRINTK("netirq %d from device %s\n", irq,
 		((struct xenbus_device *)context)->nodename);
Index: head-2008-07-15/drivers/xen/sfc_netfront/accel.h
===================================================================
--- head-2008-07-15.orig/drivers/xen/sfc_netfront/accel.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/sfc_netfront/accel.h	2008-07-15 14:24:17.000000000 +0200
@@ -449,10 +449,8 @@ void netfront_accel_msg_tx_fastpath(netf
 				    u32 ip, u16 port, u8 protocol);
 
 /* Process an IRQ received from back end driver */
-irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context, 
-						     struct pt_regs *unused);
-irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context, 
-						     struct pt_regs *unused);
+irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context);
+irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context);
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
 extern void netfront_accel_msg_from_bend(struct work_struct *context);
Index: head-2008-07-15/drivers/xen/sfc_netfront/accel_msg.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/sfc_netfront/accel_msg.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/sfc_netfront/accel_msg.c	2008-07-15 14:24:17.000000000 +0200
@@ -490,8 +490,7 @@ void netfront_accel_msg_from_bend(void *
 }
 
 
-irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context, 
-						 struct pt_regs *unused)
+irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context)
 {
 	netfront_accel_vnic *vnic = (netfront_accel_vnic *)context;
 	VPRINTK("irq %d from device %s\n", irq, vnic->dev->nodename);
@@ -502,8 +501,7 @@ irqreturn_t netfront_accel_msg_channel_i
 }
 
 /* Process an interrupt received from the NIC via backend */
-irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context, 
-						     struct pt_regs *unused)
+irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context)
 {
 	netfront_accel_vnic *vnic = (netfront_accel_vnic *)context;
 	struct net_device *net_dev = vnic->net_dev;
Index: head-2008-07-15/drivers/xen/sfc_netfront/accel_tso.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/sfc_netfront/accel_tso.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/sfc_netfront/accel_tso.c	2008-07-15 14:24:17.000000000 +0200
@@ -363,7 +363,7 @@ int netfront_accel_enqueue_skb_tso(netfr
 
 	tso_check_safe(skb);
 
-	if (skb->ip_summed != CHECKSUM_HW)
+	if (skb->ip_summed != CHECKSUM_PARTIAL)
 		EPRINTK("Trying to TSO send a packet without HW checksum\n");
 
 	tso_start(&state, skb);
Index: head-2008-07-15/drivers/xen/sfc_netfront/accel_vi.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/sfc_netfront/accel_vi.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/sfc_netfront/accel_vi.c	2008-07-15 14:24:17.000000000 +0200
@@ -461,7 +461,7 @@ netfront_accel_enqueue_skb_multi(netfron
 
 	frag_i = -1;
 
-	if (skb->ip_summed == CHECKSUM_HW) {
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		/* Set to zero to encourage falcon to work it out for us */
 		*(u16*)(skb->h.raw + skb->csum) = 0;
 	}
@@ -580,7 +580,7 @@ netfront_accel_enqueue_skb_single(netfro
 	
 	kva = buf->pkt_kva;
 
-	if (skb->ip_summed == CHECKSUM_HW) {
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		/* Set to zero to encourage falcon to work it out for us */
 		*(u16*)(skb->h.raw + skb->csum) = 0;
 	}
Index: head-2008-07-15/drivers/xen/tpmback/common.h
===================================================================
--- head-2008-07-15.orig/drivers/xen/tpmback/common.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/tpmback/common.h	2008-07-15 14:24:17.000000000 +0200
@@ -61,7 +61,7 @@ void tpmif_deschedule_work(tpmif_t * tpm
 void tpmif_xenbus_init(void);
 void tpmif_xenbus_exit(void);
 int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn);
-irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs);
+irqreturn_t tpmif_be_int(int irq, void *dev_id);
 
 long int tpmback_get_instance(struct backend_info *bi);
 
Index: head-2008-07-15/drivers/xen/tpmback/tpmback.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/tpmback/tpmback.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/tpmback/tpmback.c	2008-07-15 14:24:17.000000000 +0200
@@ -502,7 +502,7 @@ static ssize_t vtpm_op_read(struct file 
 		list_del(&pak->next);
 		write_unlock_irqrestore(&dataex.pak_lock, flags);
 
-		DPRINTK("size given by app: %d, available: %d\n", size, left);
+		DPRINTK("size given by app: %zu, available: %u\n", size, left);
 
 		ret_size = min_t(size_t, size, left);
 
@@ -899,7 +899,7 @@ static void tpm_tx_action(unsigned long 
 	}
 }
 
-irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs)
+irqreturn_t tpmif_be_int(int irq, void *dev_id)
 {
 	tpmif_t *tpmif = (tpmif_t *) dev_id;
 
Index: head-2008-07-15/drivers/xen/xenbus/xenbus_comms.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/xenbus/xenbus_comms.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/xenbus/xenbus_comms.c	2008-07-15 14:24:17.000000000 +0200
@@ -55,7 +55,7 @@ static DECLARE_WORK(probe_work, xenbus_p
 
 static DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
 
-static irqreturn_t wake_waiting(int irq, void *unused, struct pt_regs *regs)
+static irqreturn_t wake_waiting(int irq, void *unused)
 {
 	if (unlikely(xenstored_ready == 0)) {
 		xenstored_ready = 1;
Index: head-2008-07-15/drivers/xen/xenoprof/xenoprofile.c
===================================================================
--- head-2008-07-15.orig/drivers/xen/xenoprof/xenoprofile.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/drivers/xen/xenoprof/xenoprofile.c	2008-07-15 14:24:17.000000000 +0200
@@ -195,7 +195,7 @@ done:
 }
 
 static irqreturn_t 
-xenoprof_ovf_interrupt(int irq, void * dev_id, struct pt_regs * regs)
+xenoprof_ovf_interrupt(int irq, void * dev_id)
 {
 	struct xenoprof_buf * buf;
 	static unsigned long flag;
Index: head-2008-07-15/include/asm-generic/pgtable.h
===================================================================
--- head-2008-07-15.orig/include/asm-generic/pgtable.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/include/asm-generic/pgtable.h	2008-07-15 14:24:17.000000000 +0200
@@ -100,7 +100,7 @@ static inline void ptep_set_wrprotect(st
 #endif
 
 #ifndef arch_change_pte_range
-#define arch_change_pte_range(mm, pmd, addr, end, newprot) 0
+#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) 0
 #endif
 
 #ifndef __HAVE_ARCH_PTE_SAME
Index: head-2008-07-15/include/asm-x86/mach-xen/asm/desc_32.h
===================================================================
--- head-2008-07-15.orig/include/asm-x86/mach-xen/asm/desc_32.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/include/asm-x86/mach-xen/asm/desc_32.h	2008-07-15 14:24:17.000000000 +0200
@@ -32,52 +32,110 @@ static inline struct desc_struct *get_cp
 	return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
 }
 
+/*
+ * This is the ldt that every process will get unless we need
+ * something other than this.
+ */
+extern struct desc_struct default_ldt[];
+extern struct desc_struct idt_table[];
+extern void set_intr_gate(unsigned int irq, void * addr);
+
+static inline void pack_descriptor(__u32 *a, __u32 *b,
+	unsigned long base, unsigned long limit, unsigned char type, unsigned char flags)
+{
+	*a = ((base & 0xffff) << 16) | (limit & 0xffff);
+	*b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
+		(limit & 0x000f0000) | ((type & 0xff) << 8) | ((flags & 0xf) << 20);
+}
+
+static inline void pack_gate(__u32 *a, __u32 *b,
+	unsigned long base, unsigned short seg, unsigned char type, unsigned char flags)
+{
+	*a = (seg << 16) | (base & 0xffff);
+	*b = (base & 0xffff0000) | ((type & 0xff) << 8) | (flags & 0xff);
+}
+
+#define DESCTYPE_LDT 	0x82	/* present, system, DPL-0, LDT */
+#define DESCTYPE_TSS 	0x89	/* present, system, DPL-0, 32-bit TSS */
+#define DESCTYPE_TASK	0x85	/* present, system, DPL-0, task gate */
+#define DESCTYPE_INT	0x8e	/* present, system, DPL-0, interrupt gate */
+#define DESCTYPE_TRAP	0x8f	/* present, system, DPL-0, trap gate */
+#define DESCTYPE_DPL3	0x60	/* DPL-3 */
+#define DESCTYPE_S	0x10	/* !system */
+
 #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
 #define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
 
 #define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
 #define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
-#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr))
-#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt))
+#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
+#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
 
 #define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
 #define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
-#define store_tr(tr) __asm__ ("str %0":"=mr" (tr))
-#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt))
+#define store_tr(tr) __asm__ ("str %0":"=m" (tr))
+#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
 
-/*
- * This is the ldt that every process will get unless we need
- * something other than this.
- */
-extern struct desc_struct default_ldt[];
-extern void set_intr_gate(unsigned int irq, void * addr);
+#if TLS_SIZE != 24
+# error update this code.
+#endif
+
+static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
+{
+#define C(i) if (HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), \
+					      *(u64 *)&t->tls_array[i])) \
+		BUG();
+	C(0); C(1); C(2);
+#undef C
+}
 
-#define _set_tssldt_desc(n,addr,limit,type) \
-__asm__ __volatile__ ("movw %w3,0(%2)\n\t" \
-	"movw %w1,2(%2)\n\t" \
-	"rorl $16,%1\n\t" \
-	"movb %b1,4(%2)\n\t" \
-	"movb %4,5(%2)\n\t" \
-	"movb $0,6(%2)\n\t" \
-	"movb %h1,7(%2)\n\t" \
-	"rorl $16,%1" \
-	: "=m"(*(n)) : "q" (addr), "r"(n), "ir"(limit), "i"(type))
+#ifndef CONFIG_XEN
+static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b)
+{
+	__u32 *lp = (__u32 *)((char *)dt + entry*8);
+	*lp = entry_a;
+	*(lp+1) = entry_b;
+}
 
-#ifndef CONFIG_X86_NO_TSS
-static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr)
+#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
+#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
+#else
+extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
+extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b);
+#endif
+#ifndef CONFIG_X86_NO_IDT
+#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
+
+static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
 {
-	_set_tssldt_desc(&get_cpu_gdt_table(cpu)[entry], (int)addr,
-		offsetof(struct tss_struct, __cacheline_filler) - 1, 0x89);
+	__u32 a, b;
+	pack_gate(&a, &b, (unsigned long)addr, seg, type, 0);
+	write_idt_entry(idt_table, gate, a, b);
 }
+#endif
 
-#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
+#ifndef CONFIG_X86_NO_TSS
+static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr)
+{
+	__u32 a, b;
+	pack_descriptor(&a, &b, (unsigned long)addr,
+			offsetof(struct tss_struct, __cacheline_filler) - 1,
+			DESCTYPE_TSS, 0);
+	write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b);
+}
 #endif
 
-static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size)
+static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int entries)
 {
-	_set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82);
+	__u32 a, b;
+	pack_descriptor(&a, &b, (unsigned long)addr,
+			entries * sizeof(struct desc_struct) - 1,
+			DESCTYPE_LDT, 0);
+	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
 }
 
+#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
+
 #define LDT_entry_a(info) \
 	((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
 
@@ -103,21 +161,6 @@ static inline void set_ldt_desc(unsigned
 	(info)->seg_not_present	== 1	&& \
 	(info)->useable		== 0	)
 
-extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
-
-#if TLS_SIZE != 24
-# error update this code.
-#endif
-
-static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
-{
-#define C(i) if (HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), \
-					       *(u64 *)&t->tls_array[i])) \
-		BUG();
-	C(0); C(1); C(2);
-#undef C
-}
-
 static inline void clear_LDT(void)
 {
 	int cpu = get_cpu();
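
pack_descriptor() replaces the _set_tssldt_desc() asm string with plain C that builds the two 32-bit words of a GDT descriptor. A self-contained userspace check of the encoding, mirroring the helper above (the base and limit are chosen purely for illustration):

#include <stdio.h>

typedef unsigned int u32;

static void pack_descriptor(u32 *a, u32 *b, unsigned long base,
			    unsigned long limit, unsigned char type,
			    unsigned char flags)
{
	*a = ((base & 0xffff) << 16) | (limit & 0xffff);
	*b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
	     (limit & 0x000f0000) | ((type & 0xff) << 8) |
	     ((flags & 0xf) << 20);
}

int main(void)
{
	u32 a, b;

	/* A present 32-bit TSS (DESCTYPE_TSS, 0x89) at base 0xc0001000
	 * with limit 0x67, as __set_tss_desc() would build it. */
	pack_descriptor(&a, &b, 0xc0001000UL, 0x67, 0x89, 0);
	printf("%08x %08x\n", a, b);	/* prints: 10000067 c0008900 */
	return 0;
}
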
Index: head-2008-07-15/include/asm-x86/mach-xen/asm/fixmap_32.h
===================================================================
--- head-2008-07-15.orig/include/asm-x86/mach-xen/asm/fixmap_32.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/include/asm-x86/mach-xen/asm/fixmap_32.h	2008-07-15 14:24:17.000000000 +0200
@@ -55,7 +55,7 @@ enum fixed_addresses {
 #ifdef CONFIG_X86_LOCAL_APIC
 	FIX_APIC_BASE,	/* local (CPU) APIC) -- required for SMP or not */
 #endif
-#ifdef CONFIG_X86_IO_APIC
+#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_XEN)
 	FIX_IO_APIC_BASE_0,
 	FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
 #endif
@@ -95,10 +95,9 @@ enum fixed_addresses {
 	__end_of_fixed_addresses
 };
 
-extern void set_fixaddr_top(unsigned long top);
-
 extern void __set_fixmap(enum fixed_addresses idx,
 					maddr_t phys, pgprot_t flags);
+extern void reserve_top_address(unsigned long reserve);
 
 #define set_fixmap(idx, phys) \
 		__set_fixmap(idx, phys, PAGE_KERNEL)
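
set_fixaddr_top() gives way to mainline 2.6.19's reserve_top_address(), which carves hypervisor space out of the top of the 32-bit linear map before any fixmap slot is handed out; it therefore has to run very early in boot. Hypothetical usage (the size is illustrative):

	/* Early boot, before the first __set_fixmap() user: keep the
	 * top 64 MB of virtual address space for the hypervisor. */
	reserve_top_address(64 << 20);
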
Index: head-2008-07-15/include/asm-x86/mach-xen/asm/hw_irq_32.h
===================================================================
--- head-2008-07-15.orig/include/asm-x86/mach-xen/asm/hw_irq_32.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/include/asm-x86/mach-xen/asm/hw_irq_32.h	2008-07-15 14:24:17.000000000 +0200
@@ -17,8 +17,6 @@
 #include <asm/irq.h>
 #include <asm/sections.h>
 
-struct hw_interrupt_type;
-
 #define NMI_VECTOR		0x02
 
 /*
@@ -28,10 +26,6 @@ struct hw_interrupt_type;
  * Interrupt entry/exit code at both C and assembly level
  */
 
-extern u8 irq_vector[NR_IRQ_VECTORS];
-#define IO_APIC_VECTOR(irq)	(irq_vector[irq])
-#define AUTO_ASSIGN		-1
-
 extern void (*interrupt[NR_IRQS])(void);
 
 #ifdef CONFIG_SMP
@@ -44,7 +38,7 @@ fastcall void call_function_interrupt(vo
 fastcall void apic_timer_interrupt(void);
 fastcall void error_interrupt(void);
 fastcall void spurious_interrupt(void);
-fastcall void thermal_interrupt(struct pt_regs *);
+fastcall void thermal_interrupt(void);
 #define platform_legacy_irq(irq)	((irq) < 16)
 #endif
 
Index: head-2008-07-15/include/asm-x86/mach-xen/asm/io_32.h
===================================================================
--- head-2008-07-15.orig/include/asm-x86/mach-xen/asm/io_32.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/include/asm-x86/mach-xen/asm/io_32.h	2008-07-15 14:24:17.000000000 +0200
@@ -237,33 +237,6 @@ static inline void memcpy_toio(volatile 
 
 #define eth_io_copy_and_sum(a,b,c,d)		eth_copy_and_sum((a),(void __force *)(b),(c),(d))
 
-/**
- *	check_signature		-	find BIOS signatures
- *	@io_addr: mmio address to check 
- *	@signature:  signature block
- *	@length: length of signature
- *
- *	Perform a signature comparison with the mmio address io_addr. This
- *	address should have been obtained by ioremap.
- *	Returns 1 on a match.
- */
- 
-static inline int check_signature(volatile void __iomem * io_addr,
-	const unsigned char *signature, int length)
-{
-	int retval = 0;
-	do {
-		if (readb(io_addr) != *signature)
-			goto out;
-		io_addr++;
-		signature++;
-		length--;
-	} while (length);
-	retval = 1;
-out:
-	return retval;
-}
-
 /*
  *	Cache management
  *
Index: head-2008-07-15/include/asm-x86/mach-xen/asm/pgtable-3level.h
===================================================================
--- head-2008-07-15.orig/include/asm-x86/mach-xen/asm/pgtable-3level.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/include/asm-x86/mach-xen/asm/pgtable-3level.h	2008-07-15 14:24:17.000000000 +0200
@@ -53,7 +53,6 @@ static inline int pte_exec_kernel(pte_t 
  * not possible, use pte_get_and_clear to obtain the old pte
  * value and then use set_pte to update it.  -ben
  */
-#define __HAVE_ARCH_SET_PTE_ATOMIC
 
 static inline void set_pte(pte_t *ptep, pte_t pte)
 {
@@ -70,14 +69,6 @@ static inline void set_pte(pte_t *ptep, 
 		set_pte((ptep), (pteval));				\
 } while (0)
 
-#define set_pte_at_sync(_mm,addr,ptep,pteval) do {			\
-	if (((_mm) != current->mm && (_mm) != &init_mm) ||		\
-	    HYPERVISOR_update_va_mapping((addr), (pteval), UVMF_INVLPG)) { \
-		set_pte((ptep), (pteval));				\
-		xen_invlpg((addr));					\
-	}								\
-} while (0)
-
 #define set_pmd(pmdptr,pmdval)				\
 		xen_l2_entry_update((pmdptr), (pmdval))
 #define set_pud(pudptr,pudval) \
@@ -94,7 +85,7 @@ static inline void pud_clear (pud_t * pu
 #define pud_page(pud) \
 ((struct page *) __va(pud_val(pud) & PAGE_MASK))
 
-#define pud_page_kernel(pud) \
+#define pud_page_vaddr(pud) \
 ((unsigned long) __va(pud_val(pud) & PAGE_MASK))
 
 
@@ -124,6 +115,7 @@ static inline void pte_clear(struct mm_s
 
 #define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
 
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
 	pte_t pte = *ptep;
@@ -142,6 +134,7 @@ static inline pte_t ptep_get_and_clear(s
 	return pte;
 }
 
+#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
 #define ptep_clear_flush(vma, addr, ptep)			\
 ({								\
 	pte_t *__ptep = (ptep);					\
@@ -159,6 +152,7 @@ static inline pte_t ptep_get_and_clear(s
 	__res;							\
 })
 
+#define __HAVE_ARCH_PTE_SAME
 static inline int pte_same(pte_t a, pte_t b)
 {
 	return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
Index: head-2008-07-15/include/asm-x86/mach-xen/asm/pgtable_32.h
===================================================================
--- head-2008-07-15.orig/include/asm-x86/mach-xen/asm/pgtable_32.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/include/asm-x86/mach-xen/asm/pgtable_32.h	2008-07-15 14:24:17.000000000 +0200
@@ -260,31 +260,89 @@ static inline pte_t pte_mkhuge(pte_t pte
 # include <asm/pgtable-2level.h>
 #endif
 
-#define ptep_test_and_clear_dirty(vma, addr, ptep)			\
+/*
+ * Rules for using pte_update - it must be called after any PTE update which
+ * has not been done using the set_pte / clear_pte interfaces.  It is used by
+ * shadow mode hypervisors to resynchronize the shadow page tables.  Kernel PTE
+ * updates should either be sets, clears, or set_pte_atomic for P->P
+ * transitions, which means this hook should only be called for user PTEs.
+ * This hook implies a P->P protection or access change has taken place, which
+ * requires a subsequent TLB flush.  The notification can optionally be delayed
+ * until the TLB flush event by using the pte_update_defer form of the
+ * interface, but care must be taken to assure that the flush happens while
+ * still holding the same page table lock so that the shadow and primary pages
+ * do not become out of sync on SMP.
+ */
+#define pte_update(mm, addr, ptep)		do { } while (0)
+#define pte_update_defer(mm, addr, ptep)	do { } while (0)
+
+
+/*
+ * We only update the dirty/accessed state if we set
+ * the dirty bit by hand in the kernel, since the hardware
+ * will do the accessed bit for us, and we don't want to
+ * race with other CPU's that might be updating the dirty
+ * bit at the same time.
+ */
+#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+#define ptep_set_access_flags(vma, address, ptep, entry, dirty)		\
+do {									\
+	if (dirty)							\
+		ptep_establish(vma, address, ptep, entry);		\
+} while (0)
+
+/*
+ * We don't actually have these, but we want to advertise them so that
+ * we can encompass the flush here.
+ */
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+
+/*
+ * Rules for using ptep_establish: the pte MUST be a user pte, and
+ * must be a present->present transition.
+ */
+#define __HAVE_ARCH_PTEP_ESTABLISH
+#define ptep_establish(vma, address, ptep, pteval)			\
+do {									\
+	if ( likely((vma)->vm_mm == current->mm) ) {			\
+		BUG_ON(HYPERVISOR_update_va_mapping(address,		\
+			pteval,						\
+			(unsigned long)(vma)->vm_mm->cpu_vm_mask.bits|	\
+				UVMF_INVLPG|UVMF_MULTI));		\
+	} else {							\
+		xen_l1_entry_update(ptep, pteval);			\
+		flush_tlb_page(vma, address);				\
+	}								\
+} while (0)
+
+#define __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH
+#define ptep_clear_flush_dirty(vma, address, ptep)			\
 ({									\
 	pte_t __pte = *(ptep);						\
-	int __ret = pte_dirty(__pte);					\
-	if (__ret) {							\
-		__pte = pte_mkclean(__pte);				\
-		if ((vma)->vm_mm != current->mm ||			\
-		    HYPERVISOR_update_va_mapping(addr, __pte, 0))	\
-			(ptep)->pte_low = __pte.pte_low;		\
-	}								\
-	__ret;								\
+	int __dirty = pte_dirty(__pte);					\
+	__pte = pte_mkclean(__pte);					\
+	if (test_bit(PG_pinned, &virt_to_page((vma)->vm_mm->pgd)->flags)) \
+		ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \
+	else if (__dirty)						\
+		(ptep)->pte_low = __pte.pte_low;			\
+	__dirty;							\
 })
 
-#define ptep_test_and_clear_young(vma, addr, ptep)			\
+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+#define ptep_clear_flush_young(vma, address, ptep)			\
 ({									\
 	pte_t __pte = *(ptep);						\
-	int __ret = pte_young(__pte);					\
-	if (__ret)							\
-		__pte = pte_mkold(__pte);				\
-		if ((vma)->vm_mm != current->mm ||			\
-		    HYPERVISOR_update_va_mapping(addr, __pte, 0))	\
-			(ptep)->pte_low = __pte.pte_low;		\
-	__ret;								\
+	int __young = pte_young(__pte);					\
+	__pte = pte_mkold(__pte);					\
+	if (test_bit(PG_pinned, &virt_to_page((vma)->vm_mm->pgd)->flags)) \
+		ptep_set_access_flags(vma, address, ptep, __pte, __young); \
+	else if (__young)						\
+		(ptep)->pte_low = __pte.pte_low;			\
+	__young;							\
 })
 
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
 #define ptep_get_and_clear_full(mm, addr, ptep, full)			\
 	((full) ? ({							\
 		pte_t __res = *(ptep);					\
@@ -296,6 +354,7 @@ static inline pte_t pte_mkhuge(pte_t pte
 	 }) :								\
 	 ptep_get_and_clear(mm, addr, ptep))
 
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
 	pte_t pte = *ptep;
@@ -391,11 +450,11 @@ static inline pte_t pte_modify(pte_t pte
 #define pte_index(address) \
 		(((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
 #define pte_offset_kernel(dir, address) \
-	((pte_t *) pmd_page_kernel(*(dir)) +  pte_index(address))
+	((pte_t *) pmd_page_vaddr(*(dir)) +  pte_index(address))
 
 #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
 
-#define pmd_page_kernel(pmd) \
+#define pmd_page_vaddr(pmd) \
 		((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
 
 /*
@@ -418,8 +477,6 @@ extern pte_t *lookup_address(unsigned lo
  static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;}
 #endif
 
-extern void noexec_setup(const char *str);
-
 #if defined(CONFIG_HIGHPTE)
 #define pte_offset_map(dir, address) \
 	((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + \
@@ -437,37 +494,17 @@ extern void noexec_setup(const char *str
 #define pte_unmap_nested(pte) do { } while (0)
 #endif
 
-#define __HAVE_ARCH_PTEP_ESTABLISH
-#define ptep_establish(vma, address, ptep, pteval)			\
-	do {								\
-		if ( likely((vma)->vm_mm == current->mm) ) {		\
-			BUG_ON(HYPERVISOR_update_va_mapping(address,	\
-				pteval,					\
-				(unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
-					UVMF_INVLPG|UVMF_MULTI));	\
-		} else {						\
-			xen_l1_entry_update(ptep, pteval);		\
-			flush_tlb_page(vma, address);			\
-		}							\
-	} while (0)
+/* Clear a kernel PTE and flush it from the TLB */
+#define kpte_clear_flush(ptep, vaddr) do { \
+	if (HYPERVISOR_update_va_mapping(vaddr, __pte(0), UVMF_INVLPG)) \
+		BUG(); \
+} while (0)
 
 /*
  * The i386 doesn't have any external MMU info: the kernel page
  * tables contain all the necessary information.
- *
- * Also, we only update the dirty/accessed state if we set
- * the dirty bit by hand in the kernel, since the hardware
- * will do the accessed bit for us, and we don't want to
- * race with other CPU's that might be updating the dirty
- * bit at the same time.
  */
 #define update_mmu_cache(vma,address,pte) do { } while (0)
-#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
-#define ptep_set_access_flags(vma, address, ptep, entry, dirty)		\
-	do {								\
-		if (dirty)						\
-			ptep_establish(vma, address, ptep, entry);	\
-	} while (0)
 
 #include <xen/features.h>
 void make_lowmem_page_readonly(void *va, unsigned int feature);
@@ -516,10 +553,11 @@ int touch_pte_range(struct mm_struct *mm
                     unsigned long size);
 
 int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
-		unsigned long addr, unsigned long end, pgprot_t newprot);
+		unsigned long addr, unsigned long end, pgprot_t newprot,
+		int dirty_accountable);
 
-#define arch_change_pte_range(mm, pmd, addr, end, newprot)	\
-		xen_change_pte_range(mm, pmd, addr, end, newprot)
+#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
+	xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
 
 #define io_remap_pfn_range(vma,from,pfn,size,prot) \
 direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
@@ -528,13 +566,6 @@ direct_remap_pfn_range(vma,from,pfn,size
 #define GET_IOSPACE(pfn)		0
 #define GET_PFN(pfn)			(pfn)
 
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
-#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
-#define __HAVE_ARCH_PTEP_SET_WRPROTECT
-#define __HAVE_ARCH_PTE_SAME
 #include <asm-generic/pgtable.h>
 
 #endif /* _I386_PGTABLE_H */
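
The block of __HAVE_ARCH_* defines that used to sit just ahead of the asm-generic include is dissolved: each guard now lives next to the primitive it advertises (see the pgtable-3level.h hunks above), matching how asm-generic/pgtable.h supplies a fallback for any primitive whose guard is undefined. The mechanism, shown for one primitive:

	/* Arch header: define the primitive and advertise it ... */
	#define __HAVE_ARCH_PTE_SAME
	static inline int pte_same(pte_t a, pte_t b)
	{
		return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
	}

	/* ... so the generic fallback in asm-generic/pgtable.h is skipped: */
	#ifndef __HAVE_ARCH_PTE_SAME
	#define pte_same(A,B)	(pte_val(A) == pte_val(B))
	#endif
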
Index: head-2008-07-15/include/asm-x86/mach-xen/asm/processor_32.h
===================================================================
--- head-2008-07-15.orig/include/asm-x86/mach-xen/asm/processor_32.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/include/asm-x86/mach-xen/asm/processor_32.h	2008-07-15 14:24:17.000000000 +0200
@@ -146,6 +146,18 @@ static inline void detect_ht(struct cpui
 #define X86_EFLAGS_VIP	0x00100000 /* Virtual Interrupt Pending */
 #define X86_EFLAGS_ID	0x00200000 /* CPUID detection flag */
 
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+			   unsigned int *ecx, unsigned int *edx)
+{
+	/* ecx is often an input as well as an output. */
+	__asm__(XEN_CPUID
+		: "=a" (*eax),
+		  "=b" (*ebx),
+		  "=c" (*ecx),
+		  "=d" (*edx)
+		: "0" (*eax), "2" (*ecx));
+}
+
 /*
  * Generic CPUID function
  * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
@@ -153,24 +165,18 @@ static inline void detect_ht(struct cpui
  */
 static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
 {
-	__asm__(XEN_CPUID
-		: "=a" (*eax),
-		  "=b" (*ebx),
-		  "=c" (*ecx),
-		  "=d" (*edx)
-		: "0" (op), "c"(0));
+	*eax = op;
+	*ecx = 0;
+	__cpuid(eax, ebx, ecx, edx);
 }
 
 /* Some CPUID calls want 'count' to be placed in ecx */
 static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
-	       	int *edx)
+			       int *edx)
 {
-	__asm__(XEN_CPUID
-		: "=a" (*eax),
-		  "=b" (*ebx),
-		  "=c" (*ecx),
-		  "=d" (*edx)
-		: "0" (op), "c" (count));
+	*eax = op;
+	*ecx = count;
+	__cpuid(eax, ebx, ecx, edx);
 }
 
 /*
@@ -178,42 +184,30 @@ static inline void cpuid_count(int op, i
  */
 static inline unsigned int cpuid_eax(unsigned int op)
 {
-	unsigned int eax;
+	unsigned int eax, ebx, ecx, edx;
 
-	__asm__(XEN_CPUID
-		: "=a" (eax)
-		: "0" (op)
-		: "bx", "cx", "dx");
+	cpuid(op, &eax, &ebx, &ecx, &edx);
 	return eax;
 }
 static inline unsigned int cpuid_ebx(unsigned int op)
 {
-	unsigned int eax, ebx;
+	unsigned int eax, ebx, ecx, edx;
 
-	__asm__(XEN_CPUID
-		: "=a" (eax), "=b" (ebx)
-		: "0" (op)
-		: "cx", "dx" );
+	cpuid(op, &eax, &ebx, &ecx, &edx);
 	return ebx;
 }
 static inline unsigned int cpuid_ecx(unsigned int op)
 {
-	unsigned int eax, ecx;
+	unsigned int eax, ebx, ecx, edx;
 
-	__asm__(XEN_CPUID
-		: "=a" (eax), "=c" (ecx)
-		: "0" (op)
-		: "bx", "dx" );
+	cpuid(op, &eax, &ebx, &ecx, &edx);
 	return ecx;
 }
 static inline unsigned int cpuid_edx(unsigned int op)
 {
-	unsigned int eax, edx;
+	unsigned int eax, ebx, ecx, edx;
 
-	__asm__(XEN_CPUID
-		: "=a" (eax), "=d" (edx)
-		: "0" (op)
-		: "bx", "cx");
+	cpuid(op, &eax, &ebx, &ecx, &edx);
 	return edx;
 }
 
@@ -315,6 +309,8 @@ static inline void __mwait(unsigned long
 		: :"a" (eax), "c" (ecx));
 }
 
+extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
+
 /* from system description table in BIOS.  Mostly for MCA use, but
 others may find it useful. */
 extern unsigned int machine_id;
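
The cpuid helpers are rewritten to funnel through a single __cpuid() primitive instead of five hand-rolled asm blocks, removing the fragile per-register clobber lists of the old one-output variants. Example use of the refactored interface, reading the CPU vendor string (a fragment; memcpy needs linux/string.h):

	unsigned int eax, ebx, ecx, edx;
	char vendor[13];

	cpuid(0, &eax, &ebx, &ecx, &edx);	/* leaf 0: max leaf, vendor id */
	memcpy(vendor + 0, &ebx, 4);		/* vendor bytes: EBX, EDX, ECX */
	memcpy(vendor + 4, &edx, 4);
	memcpy(vendor + 8, &ecx, 4);
	vendor[12] = '\0';			/* e.g. "GenuineIntel" */
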
Index: head-2008-07-15/include/asm-x86/mach-xen/asm/segment_32.h
===================================================================
--- head-2008-07-15.orig/include/asm-x86/mach-xen/asm/segment_32.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/include/asm-x86/mach-xen/asm/segment_32.h	2008-07-15 14:24:17.000000000 +0200
@@ -61,11 +61,9 @@
 
 #define GDT_ENTRY_KERNEL_CS		(GDT_ENTRY_KERNEL_BASE + 0)
 #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
-#define GET_KERNEL_CS() (__KERNEL_CS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
 
 #define GDT_ENTRY_KERNEL_DS		(GDT_ENTRY_KERNEL_BASE + 1)
 #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
-#define GET_KERNEL_DS() (__KERNEL_DS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
 
 #define GDT_ENTRY_TSS			(GDT_ENTRY_KERNEL_BASE + 4)
 #define GDT_ENTRY_LDT			(GDT_ENTRY_KERNEL_BASE + 5)
@@ -85,6 +83,11 @@
 
 #define GDT_SIZE (GDT_ENTRIES * 8)
 
+/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
+#define SEGMENT_IS_FLAT_CODE(x)  (((x) & 0xec) == GDT_ENTRY_KERNEL_CS * 8)
+/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
+#define SEGMENT_IS_PNP_CODE(x)   (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
+
 /* Simple and small GDT entries for booting only */
 
 #define GDT_ENTRY_BOOT_CS		2
@@ -114,4 +117,16 @@
  */
 #define IDT_ENTRIES 256
 
+/* Bottom two bits of selector give the ring privilege level */
+#define SEGMENT_RPL_MASK	0x3
+/* Bit 2 is table indicator (LDT/GDT) */
+#define SEGMENT_TI_MASK		0x4
+
+/* User mode is privilege level 3 */
+#define USER_RPL		0x3
+/* LDT segment has TI set, GDT has it cleared */
+#define SEGMENT_LDT		0x4
+#define SEGMENT_GDT		0x0
+
+#define get_kernel_rpl()  (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1)
 #endif
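
The GET_KERNEL_CS/GET_KERNEL_DS macros are replaced by the generic RPL vocabulary: under Xen a non-supervisor-mode kernel runs its segments at RPL 1 (get_kernel_rpl()), while user segments are always RPL 3, so privilege tests become selector-RPL comparisons. A sketch modelled on 2.6.19's user_mode() test:

	/* The low two bits of a selector are its RPL. */
	static inline int example_user_mode(struct pt_regs *regs)
	{
		return (regs->xcs & SEGMENT_RPL_MASK) == USER_RPL;
	}
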
Index: head-2008-07-15/include/asm-x86/mach-xen/asm/smp_32.h
===================================================================
--- head-2008-07-15.orig/include/asm-x86/mach-xen/asm/smp_32.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/include/asm-x86/mach-xen/asm/smp_32.h	2008-07-15 14:24:17.000000000 +0200
@@ -79,25 +79,36 @@ static inline int hard_smp_processor_id(
 	return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
 }
 #endif
-
-static __inline int logical_smp_processor_id(void)
-{
-	/* we don't want to mark this access volatile - bad code generation */
-	return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
-}
-
 #endif
 
+#define safe_smp_processor_id() smp_processor_id()
 extern int __cpu_disable(void);
 extern void __cpu_die(unsigned int cpu);
 extern void prefill_possible_map(void);
+extern unsigned int num_processors;
+
 #endif /* !__ASSEMBLY__ */
 
 #else /* CONFIG_SMP */
 
+#define safe_smp_processor_id()		0
 #define cpu_physical_id(cpu)		boot_cpu_physical_apicid
 
 #define NO_PROC_ID		0xFF		/* No processor magic marker */
 
 #endif
+
+#ifndef __ASSEMBLY__
+
+extern u8 apicid_2_node[];
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static __inline int logical_smp_processor_id(void)
+{
+	/* we don't want to mark this access volatile - bad code generation */
+	return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
+}
+#endif
+#endif
+
 #endif
Index: head-2008-07-15/include/asm-x86/mach-xen/asm/system_32.h
===================================================================
--- head-2008-07-15.orig/include/asm-x86/mach-xen/asm/system_32.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/include/asm-x86/mach-xen/asm/system_32.h	2008-07-15 14:24:17.000000000 +0200
@@ -267,6 +267,9 @@ static inline unsigned long __xchg(unsig
 #define cmpxchg(ptr,o,n)\
 	((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
 					(unsigned long)(n),sizeof(*(ptr))))
+#define sync_cmpxchg(ptr,o,n)\
+	((__typeof__(*(ptr)))__sync_cmpxchg((ptr),(unsigned long)(o),\
+					(unsigned long)(n),sizeof(*(ptr))))
 #endif
 
 static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
@@ -296,6 +299,39 @@ static inline unsigned long __cmpxchg(vo
 	return old;
 }
 
+/*
+ * Always use locked operations when touching memory shared with a
+ * hypervisor, since the system may be SMP even if the guest kernel
+ * isn't.
+ */
+static inline unsigned long __sync_cmpxchg(volatile void *ptr,
+					    unsigned long old,
+					    unsigned long new, int size)
+{
+	unsigned long prev;
+	switch (size) {
+	case 1:
+		__asm__ __volatile__("lock; cmpxchgb %b1,%2"
+				     : "=a"(prev)
+				     : "q"(new), "m"(*__xg(ptr)), "0"(old)
+				     : "memory");
+		return prev;
+	case 2:
+		__asm__ __volatile__("lock; cmpxchgw %w1,%2"
+				     : "=a"(prev)
+				     : "r"(new), "m"(*__xg(ptr)), "0"(old)
+				     : "memory");
+		return prev;
+	case 4:
+		__asm__ __volatile__("lock; cmpxchgl %1,%2"
+				     : "=a"(prev)
+				     : "r"(new), "m"(*__xg(ptr)), "0"(old)
+				     : "memory");
+		return prev;
+	}
+	return old;
+}
+
 #ifndef CONFIG_X86_CMPXCHG
 /*
  * Building a kernel capable running on 80386. It may be necessary to
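
As the new comment explains, a uniprocessor-configured guest may still share memory with the hypervisor and with other domains running on other CPUs, so the usual UP optimisation of dropping the lock prefix is unsafe for that memory; sync_cmpxchg() keeps the prefix unconditionally. Hypothetical use against a field in a shared page (the structure and flag bit are illustrative):

	u16 old, new;

	do {
		old = shared->flags;		/* field in a shared page */
		new = old | EXAMPLE_BUSY;	/* illustrative flag bit */
	} while (sync_cmpxchg(&shared->flags, old, new) != old);
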
Index: head-2008-07-15/include/asm-x86/mach-xen/asm/tlbflush_32.h
===================================================================
--- head-2008-07-15.orig/include/asm-x86/mach-xen/asm/tlbflush_32.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/include/asm-x86/mach-xen/asm/tlbflush_32.h	2008-07-15 14:24:17.000000000 +0200
@@ -8,8 +8,6 @@
 #define __flush_tlb_global() xen_tlb_flush()
 #define __flush_tlb_all() xen_tlb_flush()
 
-extern unsigned long pgkern_mask;
-
 #define cpu_has_invlpg	(boot_cpu_data.x86 > 3)
 
 #define __flush_tlb_single(addr) xen_invlpg(addr)
Index: head-2008-07-15/include/asm-x86/mach-xen/asm/dma-mapping_64.h
===================================================================
--- head-2008-07-15.orig/include/asm-x86/mach-xen/asm/dma-mapping_64.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/include/asm-x86/mach-xen/asm/dma-mapping_64.h	2008-07-15 14:24:17.000000000 +0200
@@ -55,13 +55,6 @@ extern dma_addr_t bad_dma_address;
 extern struct dma_mapping_ops* dma_ops;
 extern int iommu_merge;
 
-static inline int valid_dma_direction(int dma_direction)
-{
-	return ((dma_direction == DMA_BIDIRECTIONAL) ||
-		(dma_direction == DMA_TO_DEVICE) ||
-		(dma_direction == DMA_FROM_DEVICE));
-}
-
 #if 0
 static inline int dma_mapping_error(dma_addr_t dma_addr)
 {
Index: head-2008-07-15/include/asm-x86/mach-xen/asm/e820_64.h
===================================================================
--- head-2008-07-15.orig/include/asm-x86/mach-xen/asm/e820_64.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/include/asm-x86/mach-xen/asm/e820_64.h	2008-07-15 14:24:17.000000000 +0200
@@ -19,13 +19,9 @@
 
 #define E820_RAM	1
 #define E820_RESERVED	2
-#define E820_ACPI	3 /* usable as RAM once ACPI tables have been read */
+#define E820_ACPI	3
 #define E820_NVS	4
 
-#define HIGH_MEMORY	(1024*1024)
-
-#define LOWMEMSIZE()	(0x9f000)
-
 #ifndef __ASSEMBLY__
 struct e820entry {
 	u64 addr;	/* start of memory segment */
@@ -46,17 +42,16 @@ extern void setup_memory_region(void);
 extern void contig_e820_setup(void); 
 extern unsigned long e820_end_of_ram(void);
 extern void e820_reserve_resources(struct e820entry *e820, int nr_map);
+extern void e820_mark_nosave_regions(void);
 extern void e820_print_map(char *who);
 extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type);
 extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type);
 
-extern void e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end);
 extern void e820_setup_gap(struct e820entry *e820, int nr_map);
-extern unsigned long e820_hole_size(unsigned long start_pfn,
-				    unsigned long end_pfn);
+extern void e820_register_active_regions(int nid,
+				unsigned long start_pfn, unsigned long end_pfn);
 
-extern void __init parse_memopt(char *p, char **end);
-extern void __init parse_memmapopt(char *p, char **end);
+extern void finish_e820_parsing(void);
 
 extern struct e820map e820;
 
Index: head-2008-07-15/include/asm-x86/mach-xen/asm/fixmap_64.h
===================================================================
--- head-2008-07-15.orig/include/asm-x86/mach-xen/asm/fixmap_64.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/include/asm-x86/mach-xen/asm/fixmap_64.h	2008-07-15 14:24:17.000000000 +0200
@@ -41,7 +41,7 @@ enum fixed_addresses {
 #ifdef CONFIG_X86_LOCAL_APIC
 	FIX_APIC_BASE,	/* local (CPU) APIC) -- required for SMP or not */
 #endif
-#ifdef CONFIG_X86_IO_APIC
+#ifndef CONFIG_XEN
 	FIX_IO_APIC_BASE_0,
 	FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
 #endif
Index: head-2008-07-15/include/asm-x86/mach-xen/asm/hw_irq_64.h
===================================================================
--- head-2008-07-15.orig/include/asm-x86/mach-xen/asm/hw_irq_64.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/include/asm-x86/mach-xen/asm/hw_irq_64.h	2008-07-15 14:24:17.000000000 +0200
@@ -19,8 +19,7 @@
 #include <asm/irq.h>
 #include <linux/profile.h>
 #include <linux/smp.h>
-
-struct hw_interrupt_type;
+#include <linux/percpu.h>
 #endif
 
 #define NMI_VECTOR		0x02
@@ -77,9 +76,10 @@ struct hw_interrupt_type;
 
 
 #ifndef __ASSEMBLY__
-extern u8 irq_vector[NR_IRQ_VECTORS];
-#define IO_APIC_VECTOR(irq)	(irq_vector[irq])
-#define AUTO_ASSIGN		-1
+typedef int vector_irq_t[NR_VECTORS];
+DECLARE_PER_CPU(vector_irq_t, vector_irq);
+extern void __setup_vector_irq(int cpu);
+extern spinlock_t vector_lock;
 
 /*
  * Various low-level irq details needed by irq.c, process.c,
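
The global irq_vector[] table is inverted and made per-CPU: vector_irq maps a vector back to the IRQ bound to it on this CPU, which is what lets 2.6.19's x86-64 reuse the same vector number for different IRQs on different CPUs. A sketch of the translation the IRQ entry path performs (assumed shape, not the exact mainline code):

	/* In the do_IRQ() path: turn the hardware vector into an IRQ
	 * number using this CPU's table; -1 means no IRQ is bound. */
	int irq = __get_cpu_var(vector_irq)[vector];
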
Index: head-2008-07-15/include/asm-x86/mach-xen/asm/io_64.h
===================================================================
--- head-2008-07-15.orig/include/asm-x86/mach-xen/asm/io_64.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/include/asm-x86/mach-xen/asm/io_64.h	2008-07-15 14:24:17.000000000 +0200
@@ -273,33 +273,6 @@ void memset_io(volatile void __iomem *a,
 
 #define eth_io_copy_and_sum(a,b,c,d)		eth_copy_and_sum((a),(void *)(b),(c),(d))
 
-/**
- *	check_signature		-	find BIOS signatures
- *	@io_addr: mmio address to check 
- *	@signature:  signature block
- *	@length: length of signature
- *
- *	Perform a signature comparison with the mmio address io_addr. This
- *	address should have been obtained by ioremap.
- *	Returns 1 on a match.
- */
- 
-static inline int check_signature(void __iomem *io_addr,
-	const unsigned char *signature, int length)
-{
-	int retval = 0;
-	do {
-		if (readb(io_addr) != *signature)
-			goto out;
-		io_addr++;
-		signature++;
-		length--;
-	} while (length);
-	retval = 1;
-out:
-	return retval;
-}
-
 /* Nothing to do */
 
 #define dma_cache_inv(_start,_size)		do { } while (0)
Index: head-2008-07-15/include/asm-x86/mach-xen/asm/pgtable_64.h
===================================================================
--- head-2008-07-15.orig/include/asm-x86/mach-xen/asm/pgtable_64.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/include/asm-x86/mach-xen/asm/pgtable_64.h	2008-07-15 14:24:17.000000000 +0200
@@ -43,12 +43,9 @@ extern unsigned long __supported_pte_mas
 
 #define swapper_pg_dir init_level4_pgt
 
-extern int nonx_setup(char *str);
 extern void paging_init(void);
 extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
 
-extern unsigned long pgkern_mask;
-
 /*
  * ZERO_PAGE is a global shared page that is always zero: used
  * for zero-mapped memory areas etc..
@@ -118,9 +115,6 @@ static inline void pgd_clear (pgd_t * pg
         set_pgd(__user_pgd(pgd), __pgd(0));
 }
 
-#define pud_page(pud) \
-    ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
-
 #define pte_same(a, b)		((a).pte == (b).pte)
 
 #define pte_pgprot(a)	(__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
@@ -332,7 +326,7 @@ static inline pte_t ptep_get_and_clear_f
 #define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
 static inline int pte_user(pte_t pte)		{ return __pte_val(pte) & _PAGE_USER; }
 static inline int pte_read(pte_t pte)		{ return __pte_val(pte) & _PAGE_USER; }
-static inline int pte_exec(pte_t pte)		{ return __pte_val(pte) & _PAGE_USER; }
+static inline int pte_exec(pte_t pte)		{ return !(__pte_val(pte) & _PAGE_NX); }
 static inline int pte_dirty(pte_t pte)		{ return __pte_val(pte) & _PAGE_DIRTY; }
 static inline int pte_young(pte_t pte)		{ return __pte_val(pte) & _PAGE_ACCESSED; }
 static inline int pte_write(pte_t pte)		{ return __pte_val(pte) & _PAGE_RW; }
@@ -345,29 +339,12 @@ static inline pte_t pte_mkclean(pte_t pt
 static inline pte_t pte_mkold(pte_t pte)	{ __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
 static inline pte_t pte_wrprotect(pte_t pte)	{ __pte_val(pte) &= ~_PAGE_RW; return pte; }
 static inline pte_t pte_mkread(pte_t pte)	{ __pte_val(pte) |= _PAGE_USER; return pte; }
-static inline pte_t pte_mkexec(pte_t pte)	{ __pte_val(pte) |= _PAGE_USER; return pte; }
+static inline pte_t pte_mkexec(pte_t pte)	{ __pte_val(pte) &= ~_PAGE_NX; return pte; }
 static inline pte_t pte_mkdirty(pte_t pte)	{ __pte_val(pte) |= _PAGE_DIRTY; return pte; }
 static inline pte_t pte_mkyoung(pte_t pte)	{ __pte_val(pte) |= _PAGE_ACCESSED; return pte; }
 static inline pte_t pte_mkwrite(pte_t pte)	{ __pte_val(pte) |= _PAGE_RW; return pte; }
 static inline pte_t pte_mkhuge(pte_t pte)	{ __pte_val(pte) |= _PAGE_PSE; return pte; }
-
-#define ptep_test_and_clear_dirty(vma, addr, ptep)			\
-({									\
-	pte_t __pte = *(ptep);						\
-	int __ret = pte_dirty(__pte);					\
-	if (__ret)							\
-		set_pte_at((vma)->vm_mm, addr, ptep, pte_mkclean(__pte)); \
-	__ret;								\
-})
-
-#define ptep_test_and_clear_young(vma, addr, ptep)			\
-({									\
-	pte_t __pte = *(ptep);						\
-	int __ret = pte_young(__pte);					\
-	if (__ret)							\
-		set_pte_at((vma)->vm_mm, addr, ptep, pte_mkold(__pte)); \
-	__ret;								\
-})
+static inline pte_t pte_clrhuge(pte_t pte)	{ __pte_val(pte) &= ~_PAGE_PSE; return pte; }
 
 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
@@ -394,7 +371,8 @@ static inline int pmd_large(pmd_t pte) {
 /*
  * Level 4 access.
  */
-#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK))
+#define pgd_page_vaddr(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK))
+#define pgd_page(pgd)		(pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT))
 #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
 #define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
 #define pgd_offset_k(address) (init_level4_pgt + pgd_index(address))
@@ -403,16 +381,18 @@ static inline int pmd_large(pmd_t pte) {
 
 /* PUD - Level3 access */
 /* to find an entry in a page-table-directory. */
+#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
+#define pud_page(pud)		(pfn_to_page(pud_val(pud) >> PAGE_SHIFT))
 #define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
-#define pud_offset(pgd, address) ((pud_t *) pgd_page(*(pgd)) + pud_index(address))
+#define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address))
 #define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT)
 
 /* PMD  - Level 2 access */
-#define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
+#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
 #define pmd_page(pmd)		(pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
 
 #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
-#define pmd_offset(dir, address) ((pmd_t *) pud_page(*(dir)) + \
+#define pmd_offset(dir, address) ((pmd_t *) pud_page_vaddr(*(dir)) + \
                                   pmd_index(address))
 #define pmd_none(x)	(!__pmd_val(x))
 #if CONFIG_XEN_COMPAT <= 0x030002
@@ -443,6 +423,7 @@ static inline pte_t mk_pte_phys(unsigned
 { 
 	unsigned long pteval;
 	pteval = physpage | pgprot_val(pgprot);
+	pteval &= __supported_pte_mask;
 	return __pte(pteval);
 }
  
@@ -464,7 +445,7 @@ static inline pte_t pte_modify(pte_t pte
 
 #define pte_index(address) \
 		(((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
-#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_kernel(*(dir)) + \
+#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
 			pte_index(address))
 
 /* x86-64 always has all page tables mapped. */
@@ -505,6 +486,40 @@ static inline pte_t pte_modify(pte_t pte
 			ptep_establish(vma, address, ptep, entry);	\
 	} while (0)
 
+
+/*
+ * i386 says: We don't actually have these, but we want to advertise
+ * them so that we can encompass the flush here.
+ */
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+
+#define __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH
+#define ptep_clear_flush_dirty(vma, address, ptep)			\
+({									\
+	pte_t __pte = *(ptep);						\
+	int __dirty = pte_dirty(__pte);					\
+	__pte = pte_mkclean(__pte);					\
+	if ((vma)->vm_mm->context.pinned)				\
+		ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \
+	else if (__dirty)						\
+		set_pte(ptep, __pte);					\
+	__dirty;							\
+})
+
+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+#define ptep_clear_flush_young(vma, address, ptep)			\
+({									\
+	pte_t __pte = *(ptep);						\
+	int __young = pte_young(__pte);					\
+	__pte = pte_mkold(__pte);					\
+	if ((vma)->vm_mm->context.pinned)				\
+		ptep_set_access_flags(vma, address, ptep, __pte, __young); \
+	else if (__young)						\
+		set_pte(ptep, __pte);					\
+	__young;							\
+})
+
 /* Encode and de-code a swap entry */
 #define __swp_type(x)			(((x).val >> 1) & 0x3f)
 #define __swp_offset(x)			((x).val >> 8)
@@ -546,10 +561,11 @@ int touch_pte_range(struct mm_struct *mm
                     unsigned long size);
 
 int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
-		unsigned long addr, unsigned long end, pgprot_t newprot);
+		unsigned long addr, unsigned long end, pgprot_t newprot,
+		int dirty_accountable);
 
-#define arch_change_pte_range(mm, pmd, addr, end, newprot)	\
-		xen_change_pte_range(mm, pmd, addr, end, newprot)
+#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
+	xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
 
 #define io_remap_pfn_range(vma, vaddr, pfn, size, prot)		\
 		direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
@@ -571,8 +587,6 @@ int xen_change_pte_range(struct mm_struc
 #define	kc_offset_to_vaddr(o) \
    (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
 
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
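
With the pte_exec()/pte_mkexec() hunks above, executability keys off
_PAGE_NX instead of _PAGE_USER: a PTE is executable exactly when its
NX bit is clear, so user-readable but non-executable mappings become
expressible. A minimal illustrative sketch (pte_allows_code() is
hypothetical, not part of this patch):

	static inline int pte_allows_code(pte_t pte)
	{
		/* user-accessible and not marked no-execute */
		return pte_user(pte) && pte_exec(pte);
	}
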
Index: head-2008-07-15/include/asm-x86/mach-xen/asm/processor_64.h
===================================================================
--- head-2008-07-15.orig/include/asm-x86/mach-xen/asm/processor_64.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/include/asm-x86/mach-xen/asm/processor_64.h	2008-07-15 14:24:17.000000000 +0200
@@ -484,6 +484,8 @@ static inline void __mwait(unsigned long
 		: :"a" (eax), "c" (ecx));
 }
 
+extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
+
 #define stack_current() \
 ({								\
 	struct thread_info *ti;					\
Index: head-2008-07-15/include/asm-x86/mach-xen/asm/smp_64.h
===================================================================
--- head-2008-07-15.orig/include/asm-x86/mach-xen/asm/smp_64.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/include/asm-x86/mach-xen/asm/smp_64.h	2008-07-15 14:24:17.000000000 +0200
@@ -4,15 +4,12 @@
 /*
  * We need the APIC definitions automatically as part of 'smp.h'
  */
-#ifndef __ASSEMBLY__
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <linux/bitops.h>
 extern int disable_apic;
-#endif
 
 #ifdef CONFIG_X86_LOCAL_APIC
-#ifndef __ASSEMBLY__
 #include <asm/fixmap.h>
 #include <asm/mpspec.h>
 #ifdef CONFIG_X86_IO_APIC
@@ -21,10 +18,8 @@ extern int disable_apic;
 #include <asm/apic.h>
 #include <asm/thread_info.h>
 #endif
-#endif
 
 #ifdef CONFIG_SMP
-#ifndef ASSEMBLY
 
 #include <asm/pda.h>
 
@@ -41,14 +36,11 @@ extern cpumask_t cpu_initialized;
  
 extern void smp_alloc_memory(void);
 extern volatile unsigned long smp_invalidate_needed;
-extern int pic_mode;
 extern void lock_ipi_call_lock(void);
 extern void unlock_ipi_call_lock(void);
 extern int smp_num_siblings;
 extern void smp_send_reschedule(int cpu);
 void smp_stop_cpu(void);
-extern int smp_call_function_single(int cpuid, void (*func) (void *info),
-				void *info, int retry, int wait);
 
 extern cpumask_t cpu_sibling_map[NR_CPUS];
 extern cpumask_t cpu_core_map[NR_CPUS];
@@ -77,20 +69,16 @@ static inline int hard_smp_processor_id(
 }
 #endif
 
-extern int safe_smp_processor_id(void);
 extern int __cpu_disable(void);
 extern void __cpu_die(unsigned int cpu);
 extern void prefill_possible_map(void);
 extern unsigned num_processors;
 extern unsigned disabled_cpus;
 
-#endif /* !ASSEMBLY */
-
 #define NO_PROC_ID		0xFF		/* No processor magic marker */
 
 #endif
 
-#ifndef ASSEMBLY
 /*
  * Some lowlevel functions might want to know about
  * the real APIC ID <-> CPU # mapping.
@@ -114,11 +102,8 @@ static inline int cpu_present_to_apicid(
 }
 #endif
 
-#endif /* !ASSEMBLY */
-
 #ifndef CONFIG_SMP
 #define stack_smp_processor_id() 0
-#define safe_smp_processor_id() 0
 #define cpu_logical_map(x) (x)
 #else
 #include <asm/thread_info.h>
@@ -130,7 +115,6 @@ static inline int cpu_present_to_apicid(
 })
 #endif
 
-#ifndef __ASSEMBLY__
 #ifdef CONFIG_X86_LOCAL_APIC
 static __inline int logical_smp_processor_id(void)
 {
@@ -138,13 +122,18 @@ static __inline int logical_smp_processo
 	return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
 }
 #endif
-#endif
 
 #ifdef CONFIG_SMP
 #define cpu_physical_id(cpu)		x86_cpu_to_apicid[cpu]
 #else
 #define cpu_physical_id(cpu)		boot_cpu_id
-#endif
-
+static inline int smp_call_function_single(int cpuid, void (*func) (void *info),
+				void *info, int retry, int wait)
+{
+	/* Disable interrupts here? On SMP the handler runs with IRQs off. */
+	func(info);
+	return 0;
+}
+#endif /* CONFIG_SMP */
 #endif
 
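
The new !CONFIG_SMP stub above lets callers use
smp_call_function_single() unconditionally; on UP it degenerates to a
direct call on the only CPU. A hedged usage sketch (do_drain() is
illustrative, not from the patch):

	static void do_drain(void *info)
	{
		/* runs on the addressed CPU; always CPU 0 on UP */
	}

	/* On !CONFIG_SMP this reduces to do_drain(NULL) returning 0. */
	smp_call_function_single(0, do_drain, NULL, 0, 1);
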
Index: head-2008-07-15/include/asm-x86/mach-xen/asm/system_64.h
===================================================================
--- head-2008-07-15.orig/include/asm-x86/mach-xen/asm/system_64.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/include/asm-x86/mach-xen/asm/system_64.h	2008-07-15 14:24:17.000000000 +0200
@@ -24,6 +24,7 @@
 #define __EXTRA_CLOBBER  \
 	,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"
 
+/* Save and restore flags to clear a leaked NT flag across the switch */
 #define switch_to(prev,next,last) \
 	asm volatile(SAVE_CONTEXT						    \
 		     "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */	  \
Index: head-2008-07-15/include/asm-x86/mach-xen/asm/tlbflush_64.h
===================================================================
--- head-2008-07-15.orig/include/asm-x86/mach-xen/asm/tlbflush_64.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/include/asm-x86/mach-xen/asm/tlbflush_64.h	2008-07-15 14:24:17.000000000 +0200
@@ -12,9 +12,6 @@
  */
 #define __flush_tlb_global()	xen_tlb_flush()
 
-
-extern unsigned long pgkern_mask;
-
 #define __flush_tlb_all() __flush_tlb_global()
 
 #define __flush_tlb_one(addr)	xen_invlpg((unsigned long)addr)
Index: head-2008-07-15/include/asm-x86/thread_info_64.h
===================================================================
--- head-2008-07-15.orig/include/asm-x86/thread_info_64.h	2008-07-13 23:51:29.000000000 +0200
+++ head-2008-07-15/include/asm-x86/thread_info_64.h	2008-07-15 14:24:17.000000000 +0200
@@ -157,10 +157,14 @@ static inline struct thread_info *stack_
 	(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY|_TIF_HRTICK_RESCHED)
 
 /* flags to check in __switch_to() */
+#ifndef CONFIG_XEN
 #define _TIF_WORK_CTXSW							\
 	(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS|_TIF_NOTSC)
 #define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
+#else
+#define _TIF_WORK_CTXSW _TIF_DEBUG
+#endif
 
 #define PREEMPT_ACTIVE     0x10000000
 
Index: head-2008-07-15/include/linux/skbuff.h
===================================================================
--- head-2008-07-15.orig/include/linux/skbuff.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/include/linux/skbuff.h	2008-07-15 14:24:17.000000000 +0200
@@ -1716,5 +1716,12 @@ static inline void skb_forward_csum(stru
 }
 
 bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
+
+#ifdef CONFIG_XEN
+int skb_checksum_setup(struct sk_buff *skb);
+#else
+static inline int skb_checksum_setup(struct sk_buff *skb) { return 0; }
+#endif
+
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_SKBUFF_H */
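
The skbuff.h hunk follows the usual config-stub pattern: Xen builds
get a real declaration while everything else gets a no-op inline, so
call sites need no #ifdef. An illustrative caller (the error handling
is a sketch, not from the patch):

	/* Works identically on Xen and non-Xen builds. */
	if (skb_checksum_setup(skb)) {
		kfree_skb(skb);	/* malformed partial-checksum packet */
		return NETDEV_TX_OK;
	}
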
Index: head-2008-07-15/include/xen/evtchn.h
===================================================================
--- head-2008-07-15.orig/include/xen/evtchn.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/include/xen/evtchn.h	2008-07-15 14:24:17.000000000 +0200
@@ -54,34 +54,34 @@
  */
 int bind_caller_port_to_irqhandler(
 	unsigned int caller_port,
-	irqreturn_t (*handler)(int, void *, struct pt_regs *),
+	irq_handler_t handler,
 	unsigned long irqflags,
 	const char *devname,
 	void *dev_id);
 int bind_listening_port_to_irqhandler(
 	unsigned int remote_domain,
-	irqreturn_t (*handler)(int, void *, struct pt_regs *),
+	irq_handler_t handler,
 	unsigned long irqflags,
 	const char *devname,
 	void *dev_id);
 int bind_interdomain_evtchn_to_irqhandler(
 	unsigned int remote_domain,
 	unsigned int remote_port,
-	irqreturn_t (*handler)(int, void *, struct pt_regs *),
+	irq_handler_t handler,
 	unsigned long irqflags,
 	const char *devname,
 	void *dev_id);
 int bind_virq_to_irqhandler(
 	unsigned int virq,
 	unsigned int cpu,
-	irqreturn_t (*handler)(int, void *, struct pt_regs *),
+	irq_handler_t handler,
 	unsigned long irqflags,
 	const char *devname,
 	void *dev_id);
 int bind_ipi_to_irqhandler(
 	unsigned int ipi,
 	unsigned int cpu,
-	irqreturn_t (*handler)(int, void *, struct pt_regs *),
+	irq_handler_t handler,
 	unsigned long irqflags,
 	const char *devname,
 	void *dev_id);
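
All five binders above switch from an open-coded function-pointer
type to irq_handler_t, matching the 2.6.19 change that dropped the
struct pt_regs argument from interrupt handlers. A handler written
against the new signature might look like this (the name is
illustrative; get_irq_regs() is the documented replacement when the
register frame is needed):

	static irqreturn_t evtchn_demo_interrupt(int irq, void *dev_id)
	{
		/* struct pt_regs is no longer passed in; call
		 * get_irq_regs() if the register state is required. */
		return IRQ_HANDLED;
	}
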
Index: head-2008-07-15/include/xen/xencons.h
===================================================================
--- head-2008-07-15.orig/include/xen/xencons.h	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/include/xen/xencons.h	2008-07-15 14:24:17.000000000 +0200
@@ -8,7 +8,7 @@ void xencons_force_flush(void);
 void xencons_resume(void);
 
 /* Interrupt work hooks. Receive data, or kick data out. */
-void xencons_rx(char *buf, unsigned len, struct pt_regs *regs);
+void xencons_rx(char *buf, unsigned len);
 void xencons_tx(void);
 
 int xencons_ring_init(void);
Index: head-2008-07-15/mm/mprotect.c
===================================================================
--- head-2008-07-15.orig/mm/mprotect.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/mm/mprotect.c	2008-07-15 14:24:17.000000000 +0200
@@ -93,7 +93,7 @@ static inline void change_pmd_range(stru
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		if (arch_change_pte_range(mm, pmd, addr, next, newprot))
+		if (arch_change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable))
 			continue;
 		change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
 	} while (pmd++, addr = next, addr != end);
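
dirty_accountable is simply threaded through to the Xen hook so it
can mirror the generic change_pte_range() semantics, where a dirty
pte may be left writable because dirty accounting will catch the
write. Roughly, the generic logic being mirrored (a sketch of
upstream mprotect, not this patch):

	ptent = pte_modify(ptent, newprot);
	/* avoid taking a write fault just to mark the page dirty */
	if (dirty_accountable && pte_dirty(ptent))
		ptent = pte_mkwrite(ptent);
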
Index: head-2008-07-15/net/core/dev.c
===================================================================
--- head-2008-07-15.orig/net/core/dev.c	2008-07-16 12:40:44.000000000 +0200
+++ head-2008-07-15/net/core/dev.c	2008-07-15 14:24:17.000000000 +0200
@@ -1643,15 +1643,14 @@ inline int skb_checksum_setup(struct sk_
 		}
 		if ((skb->h.raw + skb->csum + 2) > skb->tail)
 			goto out;
-		skb->ip_summed = CHECKSUM_HW;
+		skb->ip_summed = CHECKSUM_PARTIAL;
 		skb->proto_csum_blank = 0;
 	}
 	return 0;
 out:
 	return -EPROTO;
 }
-#else
-inline int skb_checksum_setup(struct sk_buff *skb) { return 0; }
+EXPORT_SYMBOL(skb_checksum_setup);
 #endif
 
 /**
@@ -2145,7 +2144,7 @@ int netif_receive_skb(struct sk_buff *sk
 	case CHECKSUM_UNNECESSARY:
 		skb->proto_data_valid = 1;
 		break;
-	case CHECKSUM_HW:
+	case CHECKSUM_PARTIAL:
 		/* XXX Implement me. */
 	default:
 		skb->proto_data_valid = 0;
@@ -4697,7 +4696,6 @@ EXPORT_SYMBOL(unregister_netdevice_notif
 EXPORT_SYMBOL(net_enable_timestamp);
 EXPORT_SYMBOL(net_disable_timestamp);
 EXPORT_SYMBOL(dev_get_flags);
-EXPORT_SYMBOL(skb_checksum_setup);
 
 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
 EXPORT_SYMBOL(br_handle_frame_hook);
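
The dev.c hunks track the 2.6.19 checksum rework, in which the old
CHECKSUM_HW value was split into CHECKSUM_PARTIAL (transmit: the
checksum still has to be filled in) and CHECKSUM_COMPLETE (receive:
the hardware already summed the packet). The Xen paths touched here
are transmit-side, hence CHECKSUM_PARTIAL. A consumer might
distinguish the cases like this (sketch only):

	switch (skb->ip_summed) {
	case CHECKSUM_UNNECESSARY:	/* verified by hardware */
		break;
	case CHECKSUM_PARTIAL:		/* checksum must still be computed */
		break;
	default:			/* CHECKSUM_NONE: verify in software */
		break;
	}
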