From: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
Subject: Linux: 3.3
Patch-mainline: 3.3
This patch contains the differences between Linux 3.2 and 3.3.
Automatically created from "patch-3.3" by xen-port-patches.py
Acked-by: jbeulich@suse.com
3.4/arch/x86/include/mach-xen/asm/i387.h (moved to fpu-internal.h)
3.13/drivers/acpi/processor_core.c (needs re-implementation)
3.13/drivers/acpi/processor_perflib.c (acpi_processor_load_module() disappeared)
--- head.orig/arch/x86/ia32/ia32entry-xen.S 2011-11-17 15:56:06.000000000 +0100
+++ head/arch/x86/ia32/ia32entry-xen.S 2012-02-09 12:46:23.000000000 +0100
@@ -14,6 +14,7 @@
#include <asm/segment.h>
#include <asm/irqflags.h>
#include <linux/linkage.h>
+#include <linux/err.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
@@ -22,8 +23,6 @@
.section .entry.text, "ax"
-#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
-
.macro IA32_ARG_FIXUP noebp=0
movl %edi,%r8d
.if \noebp
@@ -128,7 +127,7 @@ ENTRY(ia32_sysenter_target)
CFI_RESTORE rcx
movl %ebp,%ebp /* zero extension */
movl %eax,%eax
- movl 48-THREAD_SIZE+TI_sysenter_return(%rsp),%r10d
+ movl TI_sysenter_return+THREAD_INFO(%rsp,8*6-KERNEL_STACK_OFFSET),%r10d
movl $__USER32_DS,40(%rsp)
movq %rbp,32(%rsp)
movl $__USER32_CS,16(%rsp)
@@ -142,9 +141,8 @@ ENTRY(ia32_sysenter_target)
.section __ex_table,"a"
.quad 1b,ia32_badarg
.previous
- GET_THREAD_INFO(%r10)
- orl $TS_COMPAT,TI_status(%r10)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
+ orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jnz sysenter_tracesys
jmp .Lia32_check_call
@@ -156,7 +154,7 @@ ENTRY(ia32_sysenter_target)
movl %ebx,%edx /* 3rd arg: 1st syscall arg */
movl %eax,%esi /* 2nd arg: syscall number */
movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */
- call audit_syscall_entry
+ call __audit_syscall_entry
movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */
cmpq $(IA32_NR_syscalls-1),%rax
ja ia32_badsys
@@ -219,9 +217,8 @@ ENTRY(ia32_cstar_target)
.section __ex_table,"a"
.quad 1b,ia32_badarg
.previous
- GET_THREAD_INFO(%r10)
- orl $TS_COMPAT,TI_status(%r10)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
+ orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jnz cstar_tracesys
cmpq $IA32_NR_syscalls-1,%rax
ja ia32_badsys
@@ -238,7 +235,7 @@ cstar_auditsys:
cstar_tracesys:
#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jz cstar_auditsys
#endif
xchgl %r9d,%ebp
@@ -302,9 +299,8 @@ ENTRY(ia32_syscall)
/* note the registers are not zero extended to the sf.
this could be a problem. */
SAVE_ARGS 0,1,0
- GET_THREAD_INFO(%r10)
- orl $TS_COMPAT,TI_status(%r10)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
+ orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jnz ia32_tracesys
.Lia32_check_call:
cmpq $(IA32_NR_syscalls-1),%rax
@@ -320,7 +316,7 @@ ia32_sysret:
sysenter_tracesys:
#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jz sysenter_auditsys
#endif
ia32_tracesys:
@@ -341,14 +337,11 @@ ia32_badsys:
movq $-ENOSYS,%rax
jmp ia32_sysret
-quiet_ni_syscall:
- movq $-ENOSYS,%rax
- ret
CFI_ENDPROC
.macro PTREGSCALL label, func, arg
- .globl \label
-\label:
+ ALIGN
+GLOBAL(\label)
leaq \func(%rip),%rax
leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
jmp ia32_ptregs_common
@@ -365,7 +358,8 @@ quiet_ni_syscall:
PTREGSCALL stub32_vfork, sys_vfork, %rdi
PTREGSCALL stub32_iopl, sys_iopl, %rsi
-ENTRY(ia32_ptregs_common)
+ ALIGN
+ia32_ptregs_common:
popq %r11
CFI_ENDPROC
CFI_STARTPROC32 simple
@@ -387,357 +381,3 @@ ENTRY(ia32_ptregs_common)
jmp ia32_sysret /* misbalances the return cache */
CFI_ENDPROC
END(ia32_ptregs_common)
-
- .section .rodata,"a"
- .align 8
-ia32_sys_call_table:
- .quad sys_restart_syscall
- .quad sys_exit
- .quad stub32_fork
- .quad sys_read
- .quad sys_write
- .quad compat_sys_open /* 5 */
- .quad sys_close
- .quad sys32_waitpid
- .quad sys_creat
- .quad sys_link
- .quad sys_unlink /* 10 */
- .quad stub32_execve
- .quad sys_chdir
- .quad compat_sys_time
- .quad sys_mknod
- .quad sys_chmod /* 15 */
- .quad sys_lchown16
- .quad quiet_ni_syscall /* old break syscall holder */
- .quad sys_stat
- .quad sys32_lseek
- .quad sys_getpid /* 20 */
- .quad compat_sys_mount /* mount */
- .quad sys_oldumount /* old_umount */
- .quad sys_setuid16
- .quad sys_getuid16
- .quad compat_sys_stime /* stime */ /* 25 */
- .quad compat_sys_ptrace /* ptrace */
- .quad sys_alarm
- .quad sys_fstat /* (old)fstat */
- .quad sys_pause
- .quad compat_sys_utime /* 30 */
- .quad quiet_ni_syscall /* old stty syscall holder */
- .quad quiet_ni_syscall /* old gtty syscall holder */
- .quad sys_access
- .quad sys_nice
- .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */
- .quad sys_sync
- .quad sys32_kill
- .quad sys_rename
- .quad sys_mkdir
- .quad sys_rmdir /* 40 */
- .quad sys_dup
- .quad sys_pipe
- .quad compat_sys_times
- .quad quiet_ni_syscall /* old prof syscall holder */
- .quad sys_brk /* 45 */
- .quad sys_setgid16
- .quad sys_getgid16
- .quad sys_signal
- .quad sys_geteuid16
- .quad sys_getegid16 /* 50 */
- .quad sys_acct
- .quad sys_umount /* new_umount */
- .quad quiet_ni_syscall /* old lock syscall holder */
- .quad compat_sys_ioctl
- .quad compat_sys_fcntl64 /* 55 */
- .quad quiet_ni_syscall /* old mpx syscall holder */
- .quad sys_setpgid
- .quad quiet_ni_syscall /* old ulimit syscall holder */
- .quad sys_olduname
- .quad sys_umask /* 60 */
- .quad sys_chroot
- .quad compat_sys_ustat
- .quad sys_dup2
- .quad sys_getppid
- .quad sys_getpgrp /* 65 */
- .quad sys_setsid
- .quad sys32_sigaction
- .quad sys_sgetmask
- .quad sys_ssetmask
- .quad sys_setreuid16 /* 70 */
- .quad sys_setregid16
- .quad sys32_sigsuspend
- .quad compat_sys_sigpending
- .quad sys_sethostname
- .quad compat_sys_setrlimit /* 75 */
- .quad compat_sys_old_getrlimit /* old_getrlimit */
- .quad compat_sys_getrusage
- .quad compat_sys_gettimeofday
- .quad compat_sys_settimeofday
- .quad sys_getgroups16 /* 80 */
- .quad sys_setgroups16
- .quad compat_sys_old_select
- .quad sys_symlink
- .quad sys_lstat
- .quad sys_readlink /* 85 */
- .quad sys_uselib
- .quad sys_swapon
- .quad sys_reboot
- .quad compat_sys_old_readdir
- .quad sys32_mmap /* 90 */
- .quad sys_munmap
- .quad sys_truncate
- .quad sys_ftruncate
- .quad sys_fchmod
- .quad sys_fchown16 /* 95 */
- .quad sys_getpriority
- .quad sys_setpriority
- .quad quiet_ni_syscall /* old profil syscall holder */
- .quad compat_sys_statfs
- .quad compat_sys_fstatfs /* 100 */
- .quad sys_ioperm
- .quad compat_sys_socketcall
- .quad sys_syslog
- .quad compat_sys_setitimer
- .quad compat_sys_getitimer /* 105 */
- .quad compat_sys_newstat
- .quad compat_sys_newlstat
- .quad compat_sys_newfstat
- .quad sys_uname
- .quad stub32_iopl /* 110 */
- .quad sys_vhangup
- .quad quiet_ni_syscall /* old "idle" system call */
- .quad sys32_vm86_warning /* vm86old */
- .quad compat_sys_wait4
- .quad sys_swapoff /* 115 */
- .quad compat_sys_sysinfo
- .quad sys32_ipc
- .quad sys_fsync
- .quad stub32_sigreturn
- .quad stub32_clone /* 120 */
- .quad sys_setdomainname
- .quad sys_newuname
- .quad sys_modify_ldt
- .quad compat_sys_adjtimex
- .quad sys32_mprotect /* 125 */
- .quad compat_sys_sigprocmask
- .quad quiet_ni_syscall /* create_module */
- .quad sys_init_module
- .quad sys_delete_module
- .quad quiet_ni_syscall /* 130 get_kernel_syms */
- .quad sys32_quotactl
- .quad sys_getpgid
- .quad sys_fchdir
- .quad quiet_ni_syscall /* bdflush */
- .quad sys_sysfs /* 135 */
- .quad sys_personality
- .quad quiet_ni_syscall /* for afs_syscall */
- .quad sys_setfsuid16
- .quad sys_setfsgid16
- .quad sys_llseek /* 140 */
- .quad compat_sys_getdents
- .quad compat_sys_select
- .quad sys_flock
- .quad sys_msync
- .quad compat_sys_readv /* 145 */
- .quad compat_sys_writev
- .quad sys_getsid
- .quad sys_fdatasync
- .quad compat_sys_sysctl /* sysctl */
- .quad sys_mlock /* 150 */
- .quad sys_munlock
- .quad sys_mlockall
- .quad sys_munlockall
- .quad sys_sched_setparam
- .quad sys_sched_getparam /* 155 */
- .quad sys_sched_setscheduler
- .quad sys_sched_getscheduler
- .quad sys_sched_yield
- .quad sys_sched_get_priority_max
- .quad sys_sched_get_priority_min /* 160 */
- .quad sys32_sched_rr_get_interval
- .quad compat_sys_nanosleep
- .quad sys_mremap
- .quad sys_setresuid16
- .quad sys_getresuid16 /* 165 */
- .quad sys32_vm86_warning /* vm86 */
- .quad quiet_ni_syscall /* query_module */
- .quad sys_poll
- .quad quiet_ni_syscall /* old nfsservctl */
- .quad sys_setresgid16 /* 170 */
- .quad sys_getresgid16
- .quad sys_prctl
- .quad stub32_rt_sigreturn
- .quad sys32_rt_sigaction
- .quad sys32_rt_sigprocmask /* 175 */
- .quad sys32_rt_sigpending
- .quad compat_sys_rt_sigtimedwait
- .quad sys32_rt_sigqueueinfo
- .quad sys_rt_sigsuspend
- .quad sys32_pread /* 180 */
- .quad sys32_pwrite
- .quad sys_chown16
- .quad sys_getcwd
- .quad sys_capget
- .quad sys_capset
- .quad stub32_sigaltstack
- .quad sys32_sendfile
- .quad quiet_ni_syscall /* streams1 */
- .quad quiet_ni_syscall /* streams2 */
- .quad stub32_vfork /* 190 */
- .quad compat_sys_getrlimit
- .quad sys_mmap_pgoff
- .quad sys32_truncate64
- .quad sys32_ftruncate64
- .quad sys32_stat64 /* 195 */
- .quad sys32_lstat64
- .quad sys32_fstat64
- .quad sys_lchown
- .quad sys_getuid
- .quad sys_getgid /* 200 */
- .quad sys_geteuid
- .quad sys_getegid
- .quad sys_setreuid
- .quad sys_setregid
- .quad sys_getgroups /* 205 */
- .quad sys_setgroups
- .quad sys_fchown
- .quad sys_setresuid
- .quad sys_getresuid
- .quad sys_setresgid /* 210 */
- .quad sys_getresgid
- .quad sys_chown
- .quad sys_setuid
- .quad sys_setgid
- .quad sys_setfsuid /* 215 */
- .quad sys_setfsgid
- .quad sys_pivot_root
- .quad sys_mincore
- .quad sys_madvise
- .quad compat_sys_getdents64 /* 220 getdents64 */
- .quad compat_sys_fcntl64
- .quad quiet_ni_syscall /* tux */
- .quad quiet_ni_syscall /* security */
- .quad sys_gettid
- .quad sys32_readahead /* 225 */
- .quad sys_setxattr
- .quad sys_lsetxattr
- .quad sys_fsetxattr
- .quad sys_getxattr
- .quad sys_lgetxattr /* 230 */
- .quad sys_fgetxattr
- .quad sys_listxattr
- .quad sys_llistxattr
- .quad sys_flistxattr
- .quad sys_removexattr /* 235 */
- .quad sys_lremovexattr
- .quad sys_fremovexattr
- .quad sys_tkill
- .quad sys_sendfile64
- .quad compat_sys_futex /* 240 */
- .quad compat_sys_sched_setaffinity
- .quad compat_sys_sched_getaffinity
- .quad sys_set_thread_area
- .quad sys_get_thread_area
- .quad compat_sys_io_setup /* 245 */
- .quad sys_io_destroy
- .quad compat_sys_io_getevents
- .quad compat_sys_io_submit
- .quad sys_io_cancel
- .quad sys32_fadvise64 /* 250 */
- .quad quiet_ni_syscall /* free_huge_pages */
- .quad sys_exit_group
- .quad sys32_lookup_dcookie
- .quad sys_epoll_create
- .quad sys_epoll_ctl /* 255 */
- .quad sys_epoll_wait
- .quad sys_remap_file_pages
- .quad sys_set_tid_address
- .quad compat_sys_timer_create
- .quad compat_sys_timer_settime /* 260 */
- .quad compat_sys_timer_gettime
- .quad sys_timer_getoverrun
- .quad sys_timer_delete
- .quad compat_sys_clock_settime
- .quad compat_sys_clock_gettime /* 265 */
- .quad compat_sys_clock_getres
- .quad compat_sys_clock_nanosleep
- .quad compat_sys_statfs64
- .quad compat_sys_fstatfs64
- .quad sys_tgkill /* 270 */
- .quad compat_sys_utimes
- .quad sys32_fadvise64_64
- .quad quiet_ni_syscall /* sys_vserver */
- .quad sys_mbind
- .quad compat_sys_get_mempolicy /* 275 */
- .quad sys_set_mempolicy
- .quad compat_sys_mq_open
- .quad sys_mq_unlink
- .quad compat_sys_mq_timedsend
- .quad compat_sys_mq_timedreceive /* 280 */
- .quad compat_sys_mq_notify
- .quad compat_sys_mq_getsetattr
- .quad compat_sys_kexec_load /* reserved for kexec */
- .quad compat_sys_waitid
- .quad quiet_ni_syscall /* 285: sys_altroot */
- .quad sys_add_key
- .quad sys_request_key
- .quad sys_keyctl
- .quad sys_ioprio_set
- .quad sys_ioprio_get /* 290 */
- .quad sys_inotify_init
- .quad sys_inotify_add_watch
- .quad sys_inotify_rm_watch
- .quad sys_migrate_pages
- .quad compat_sys_openat /* 295 */
- .quad sys_mkdirat
- .quad sys_mknodat
- .quad sys_fchownat
- .quad compat_sys_futimesat
- .quad sys32_fstatat /* 300 */
- .quad sys_unlinkat
- .quad sys_renameat
- .quad sys_linkat
- .quad sys_symlinkat
- .quad sys_readlinkat /* 305 */
- .quad sys_fchmodat
- .quad sys_faccessat
- .quad compat_sys_pselect6
- .quad compat_sys_ppoll
- .quad sys_unshare /* 310 */
- .quad compat_sys_set_robust_list
- .quad compat_sys_get_robust_list
- .quad sys_splice
- .quad sys32_sync_file_range
- .quad sys_tee /* 315 */
- .quad compat_sys_vmsplice
- .quad compat_sys_move_pages
- .quad sys_getcpu
- .quad sys_epoll_pwait
- .quad compat_sys_utimensat /* 320 */
- .quad compat_sys_signalfd
- .quad sys_timerfd_create
- .quad sys_eventfd
- .quad sys32_fallocate
- .quad compat_sys_timerfd_settime /* 325 */
- .quad compat_sys_timerfd_gettime
- .quad compat_sys_signalfd4
- .quad sys_eventfd2
- .quad sys_epoll_create1
- .quad sys_dup3 /* 330 */
- .quad sys_pipe2
- .quad sys_inotify_init1
- .quad compat_sys_preadv
- .quad compat_sys_pwritev
- .quad compat_sys_rt_tgsigqueueinfo /* 335 */
- .quad sys_perf_event_open
- .quad compat_sys_recvmmsg
- .quad sys_fanotify_init
- .quad sys32_fanotify_mark
- .quad sys_prlimit64 /* 340 */
- .quad sys_name_to_handle_at
- .quad compat_sys_open_by_handle_at
- .quad compat_sys_clock_adjtime
- .quad sys_syncfs
- .quad compat_sys_sendmmsg /* 345 */
- .quad sys_setns
- .quad compat_sys_process_vm_readv
- .quad compat_sys_process_vm_writev
-ia32_syscall_end:
--- head.orig/arch/x86/include/asm/debugreg.h 2014-05-06 08:31:14.000000000 +0200
+++ head/arch/x86/include/asm/debugreg.h 2013-01-08 12:02:11.000000000 +0100
@@ -93,7 +93,7 @@ extern void aout_dump_debugregs(struct u
extern void hw_breakpoint_restore(void);
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && !defined(CONFIG_X86_NO_IDT)
DECLARE_PER_CPU(int, debug_stack_usage);
static inline void debug_stack_usage_inc(void)
{
--- head.orig/arch/x86/include/mach-xen/asm/desc.h 2011-09-08 16:54:08.000000000 +0200
+++ head/arch/x86/include/mach-xen/asm/desc.h 2012-02-09 12:32:50.000000000 +0100
@@ -36,6 +36,8 @@ static inline void fill_ldt(struct desc_
#ifndef CONFIG_X86_NO_IDT
extern struct desc_ptr idt_descr;
extern gate_desc idt_table[];
+extern struct desc_ptr nmi_idt_descr;
+extern gate_desc nmi_idt_table[];
#endif
struct gdt_page {
@@ -332,6 +334,16 @@ static inline void set_desc_limit(struct
}
#ifndef CONFIG_X86_NO_IDT
+#ifdef CONFIG_X86_64
+static inline void set_nmi_gate(int gate, void *addr)
+{
+ gate_desc s;
+
+ pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS);
+ write_idt_entry(nmi_idt_table, gate, &s);
+}
+#endif
+
static inline void _set_gate(int gate, unsigned type, void *addr,
unsigned dpl, unsigned ist, unsigned seg)
{
--- head.orig/arch/x86/include/mach-xen/asm/fixmap.h 2011-09-08 16:54:08.000000000 +0200
+++ head/arch/x86/include/mach-xen/asm/fixmap.h 2012-02-09 12:32:50.000000000 +0100
@@ -124,7 +124,7 @@ enum fixed_addresses {
#endif
FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
-#ifdef CONFIG_X86_MRST
+#ifdef CONFIG_X86_INTEL_MID
FIX_LNW_VRTC,
#endif
__end_of_permanent_fixed_addresses,
--- head.orig/arch/x86/include/mach-xen/asm/pci.h 2011-07-01 15:19:34.000000000 +0200
+++ head/arch/x86/include/mach-xen/asm/pci.h 2012-02-09 12:32:50.000000000 +0100
@@ -118,19 +118,28 @@ static inline void x86_teardown_msi_irq(
{
x86_msi.teardown_msi_irq(irq);
}
+static inline void x86_restore_msi_irqs(struct pci_dev *dev, int irq)
+{
+ x86_msi.restore_msi_irqs(dev, irq);
+}
#define arch_setup_msi_irqs x86_setup_msi_irqs
#define arch_teardown_msi_irqs x86_teardown_msi_irqs
#define arch_teardown_msi_irq x86_teardown_msi_irq
+#define arch_restore_msi_irqs x86_restore_msi_irqs
/* implemented in arch/x86/kernel/apic/io_apic. */
int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
void native_teardown_msi_irq(unsigned int irq);
+void native_restore_msi_irqs(struct pci_dev *dev, int irq);
/* default to the implementation in drivers/lib/msi.c */
#define HAVE_DEFAULT_MSI_TEARDOWN_IRQS
+#define HAVE_DEFAULT_MSI_RESTORE_IRQS
void default_teardown_msi_irqs(struct pci_dev *dev);
+void default_restore_msi_irqs(struct pci_dev *dev, int irq);
#else
#define native_setup_msi_irqs NULL
#define native_teardown_msi_irq NULL
#define default_teardown_msi_irqs NULL
+#define default_restore_msi_irqs NULL
#endif
#define PCI_DMA_BUS_IS_PHYS 0
--- head.orig/arch/x86/include/mach-xen/asm/pgtable.h 2011-03-23 10:10:00.000000000 +0100
+++ head/arch/x86/include/mach-xen/asm/pgtable.h 2012-02-09 12:32:50.000000000 +0100
@@ -738,7 +738,7 @@ static inline void ptep_set_wrprotect(st
set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
}
-#define flush_tlb_fix_spurious_fault(vma, address)
+#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)
#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
--- head.orig/arch/x86/include/mach-xen/asm/processor.h 2011-11-17 16:53:30.000000000 +0100
+++ head/arch/x86/include/mach-xen/asm/processor.h 2012-02-29 10:59:05.000000000 +0100
@@ -109,7 +109,7 @@ struct cpuinfo_x86 {
u16 initial_apicid;
#endif
u16 x86_clflush_size;
-#ifdef CONFIG_X86_HT
+#ifndef CONFIG_XEN
/* number of cores as seen by the OS: */
u16 booted_cores;
/* Physical processor id: */
@@ -119,10 +119,8 @@ struct cpuinfo_x86 {
/* Compute unit id */
u8 compute_unit_id;
#endif
-#ifdef CONFIG_SMP
/* Index into per_cpu list: */
u16 cpu_index;
-#endif
#ifndef CONFIG_XEN
u32 microcode;
#endif
@@ -394,6 +392,8 @@ union thread_xstate {
};
struct fpu {
+ unsigned int last_cpu;
+ unsigned int has_fpu;
union thread_xstate *state;
};
--- head.orig/arch/x86/include/mach-xen/asm/smp.h 2011-04-13 17:01:31.000000000 +0200
+++ head/arch/x86/include/mach-xen/asm/smp.h 2012-02-09 12:32:50.000000000 +0100
@@ -231,5 +231,11 @@ extern int hard_smp_processor_id(void);
#endif /* CONFIG_X86_LOCAL_APIC */
+#ifdef CONFIG_DEBUG_NMI_SELFTEST
+extern void nmi_selftest(void);
+#else
+#define nmi_selftest() do { } while (0)
+#endif
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_SMP_H */
--- head.orig/arch/x86/include/mach-xen/asm/spinlock.h 2012-02-01 09:13:39.000000000 +0100
+++ head/arch/x86/include/mach-xen/asm/spinlock.h 2012-02-09 12:49:39.000000000 +0100
@@ -137,19 +137,8 @@ static __always_inline void __ticket_spi
{
register struct __raw_tickets new;
-#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)
-# define UNLOCK_SUFFIX(n) "%z" #n
-#elif TICKET_SHIFT == 8
-# define UNLOCK_SUFFIX(n) "b"
-#elif TICKET_SHIFT == 16
-# define UNLOCK_SUFFIX(n) "w"
-#endif
- asm volatile(UNLOCK_LOCK_PREFIX "inc" UNLOCK_SUFFIX(0) " %0"
- : "+m" (lock->tickets.head)
- :
- : "memory", "cc");
+ __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
#if !defined(XEN_SPINLOCK_SOURCE) || !CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING
-# undef UNLOCK_SUFFIX
# undef UNLOCK_LOCK_PREFIX
#endif
new = ACCESS_ONCE(lock->tickets);
--- head.orig/arch/x86/kernel/apic/io_apic-xen.c 2011-11-28 10:08:44.000000000 +0100
+++ head/arch/x86/kernel/apic/io_apic-xen.c 2012-02-09 12:32:50.000000000 +0100
@@ -2498,8 +2498,8 @@ asmlinkage void smp_irq_move_cleanup_int
unsigned vector, me;
ack_APIC_irq();
- exit_idle();
irq_enter();
+ exit_idle();
me = smp_processor_id();
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
@@ -3031,6 +3031,10 @@ static inline void __init check_timer(vo
}
local_irq_disable();
apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
+ if (x2apic_preenabled)
+ apic_printk(APIC_QUIET, KERN_INFO
+ "Perhaps problem with the pre-enabled x2apic mode\n"
+ "Try booting with x2apic and interrupt-remapping disabled in the bios.\n");
panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
"report. Then try booting with the 'noapic' option.\n");
out:
--- head.orig/arch/x86/kernel/cpu/common-xen.c 2012-08-01 12:05:51.000000000 +0200
+++ head/arch/x86/kernel/cpu/common-xen.c 2012-08-01 12:12:01.000000000 +0200
@@ -725,9 +725,7 @@ static void __init early_identify_cpu(st
if (this_cpu->c_early_init)
this_cpu->c_early_init(c);
-#ifdef CONFIG_SMP
c->cpu_index = 0;
-#endif
filter_cpuid_features(c, false);
setup_smep(c);
@@ -814,10 +812,7 @@ static void __cpuinit generic_identify(s
c->apicid = c->initial_apicid;
# endif
#endif
-
-#ifdef CONFIG_X86_HT
c->phys_proc_id = c->initial_apicid;
-#endif
}
#endif
@@ -1086,6 +1081,8 @@ __setup("clearcpuid=", setup_disablecpui
#ifdef CONFIG_X86_64
#ifndef CONFIG_X86_NO_IDT
struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
+struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1,
+ (unsigned long) nmi_idt_table };
#endif
DEFINE_PER_CPU_FIRST(union irq_stack_union,
@@ -1116,6 +1113,9 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) =
DEFINE_PER_CPU(unsigned int, irq_count) = -1;
+DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
+EXPORT_PER_CPU_SYMBOL(fpu_owner_task);
+
#ifndef CONFIG_X86_NO_TSS
/*
* Special IST stacks which the CPU switches to when it calls
@@ -1177,10 +1177,34 @@ unsigned long kernel_eflags;
DEFINE_PER_CPU(struct orig_ist, orig_ist);
#endif
+#ifndef CONFIG_X86_NO_IDT
+static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
+DEFINE_PER_CPU(int, debug_stack_usage);
+
+int is_debug_stack(unsigned long addr)
+{
+ return __get_cpu_var(debug_stack_usage) ||
+ (addr <= __get_cpu_var(debug_stack_addr) &&
+ addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
+}
+
+void debug_stack_set_zero(void)
+{
+ load_idt((const struct desc_ptr *)&nmi_idt_descr);
+}
+
+void debug_stack_reset(void)
+{
+ load_idt((const struct desc_ptr *)&idt_descr);
+}
+#endif
+
#else /* CONFIG_X86_64 */
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
+DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
+EXPORT_PER_CPU_SYMBOL(fpu_owner_task);
#ifdef CONFIG_CC_STACKPROTECTOR
DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
@@ -1227,6 +1251,17 @@ static void dbg_restore_debug_regs(void)
#define dbg_restore_debug_regs()
#endif /* ! CONFIG_KGDB */
+#ifndef CONFIG_XEN
+/*
+ * Prints an error where the NUMA and configured core-number mismatch and the
+ * platform didn't override this to fix it up
+ */
+void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node)
+{
+ pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id);
+}
+#endif
+
/*
* cpu_init() initializes state that is per-CPU. Some data is already
* initialized (naturally) in the bootstrap process, such as the GDT
@@ -1307,6 +1342,10 @@ void __cpuinit cpu_init(void)
estacks += exception_stack_sizes[v];
oist->ist[v] = t->x86_tss.ist[v] =
(unsigned long)estacks;
+#ifndef CONFIG_X86_NO_IDT
+ if (v == DEBUG_STACK-1)
+ per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks;
+#endif
}
}
--- head.orig/arch/x86/kernel/cpu/mcheck/mce-inject.c 2013-08-09 14:55:42.000000000 +0200
+++ head/arch/x86/kernel/cpu/mcheck/mce-inject.c 2012-10-23 15:56:41.000000000 +0200
@@ -94,6 +94,7 @@ static int mce_raise_notify(unsigned int
return NMI_HANDLED;
}
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
static void mce_irq_ipi(void *info)
{
int cpu = smp_processor_id();
@@ -105,6 +106,7 @@ static void mce_irq_ipi(void *info)
raise_exception(m, NULL);
}
}
+#endif
/* Inject mce on current CPU */
static int raise_local(void)
--- head.orig/arch/x86/kernel/e820-xen.c 2011-12-21 12:00:26.000000000 +0100
+++ head/arch/x86/kernel/e820-xen.c 2012-02-16 17:12:00.000000000 +0100
@@ -19,6 +19,7 @@
#include <linux/acpi.h>
#include <linux/firmware-map.h>
#include <linux/memblock.h>
+#include <linux/sort.h>
#include <asm/e820.h>
#include <asm/proto.h>
@@ -250,22 +251,38 @@ static void __init _e820_print_map(const
* ____________________33__
* ______________________4_
*/
+struct change_member {
+ struct e820entry *pbios; /* pointer to original bios entry */
+ unsigned long long addr; /* address for this change point */
+};
+
+static int __init cpcompare(const void *a, const void *b)
+{
+ struct change_member * const *app = a, * const *bpp = b;
+ const struct change_member *ap = *app, *bp = *bpp;
+
+ /*
+ * Inputs are pointers to two elements of change_point[]. If their
+ * addresses are unequal, their difference dominates. If the addresses
+ * are equal, then consider one that represents the end of its region
+ * to be greater than one that does not.
+ */
+ if (ap->addr != bp->addr)
+ return ap->addr > bp->addr ? 1 : -1;
+
+ return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr);
+}
int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
u32 *pnr_map)
{
- struct change_member {
- struct e820entry *pbios; /* pointer to original bios entry */
- unsigned long long addr; /* address for this change point */
- };
static struct change_member change_point_list[2*E820_X_MAX] __initdata;
static struct change_member *change_point[2*E820_X_MAX] __initdata;
static struct e820entry *overlap_list[E820_X_MAX] __initdata;
static struct e820entry new_bios[E820_X_MAX] __initdata;
- struct change_member *change_tmp;
unsigned long current_type, last_type;
unsigned long long last_addr;
- int chgidx, still_changing;
+ int chgidx;
int overlap_entries;
int new_bios_entry;
int old_nr, new_nr, chg_nr;
@@ -306,35 +323,7 @@ int __init sanitize_e820_map(struct e820
chg_nr = chgidx;
/* sort change-point list by memory addresses (low -> high) */
- still_changing = 1;
- while (still_changing) {
- still_changing = 0;
- for (i = 1; i < chg_nr; i++) {
- unsigned long long curaddr, lastaddr;
- unsigned long long curpbaddr, lastpbaddr;
-
- curaddr = change_point[i]->addr;
- lastaddr = change_point[i - 1]->addr;
- curpbaddr = change_point[i]->pbios->addr;
- lastpbaddr = change_point[i - 1]->pbios->addr;
-
- /*
- * swap entries, when:
- *
- * curaddr > lastaddr or
- * curaddr == lastaddr and curaddr == curpbaddr and
- * lastaddr != lastpbaddr
- */
- if (curaddr < lastaddr ||
- (curaddr == lastaddr && curaddr == curpbaddr &&
- lastaddr != lastpbaddr)) {
- change_tmp = change_point[i];
- change_point[i] = change_point[i-1];
- change_point[i-1] = change_tmp;
- still_changing = 1;
- }
- }
- }
+ sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL);
/* create a new bios memory map, removing overlaps */
overlap_entries = 0; /* number of entries in the overlap table */
@@ -769,7 +758,7 @@ void __init e820_mark_nosave_regions(uns
}
#endif
-#ifdef CONFIG_HIBERNATION
+#ifdef CONFIG_ACPI
/**
* Mark ACPI NVS memory region, so that we can save/restore it during
* hibernation and the subsequent resume.
@@ -782,7 +771,7 @@ static int __init e820_mark_nvs_memory(v
struct e820entry *ei = &e820.map[i];
if (ei->type == E820_NVS)
- suspend_nvs_register(ei->addr, ei->size);
+ acpi_nvs_register(ei->addr, ei->size);
}
return 0;
@@ -795,47 +784,29 @@ core_initcall(e820_mark_nvs_memory);
/*
* pre allocated 4k and reserved it in memblock and e820_saved
*/
-u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
+u64 __init early_reserve_e820(u64 size, u64 align)
{
- u64 size = 0;
u64 addr;
- u64 start;
#ifdef CONFIG_XEN
- unsigned int order = get_order(sizet);
+ unsigned int order = get_order(size);
int rc;
unsigned long max_initmap_pfn;
if (!is_initial_xendomain())
return 0;
- sizet = PAGE_SIZE << order;
+ size = PAGE_SIZE << order;
if (align < PAGE_SIZE)
align = PAGE_SIZE;
#endif
- for (start = startt; ; start += size) {
- start = memblock_x86_find_in_range_size(start, &size, align);
- if (start == MEMBLOCK_ERROR)
- return 0;
- if (size >= sizet)
- break;
+ addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
+ if (addr) {
+ e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED);
+ printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
+ update_e820_saved();
}
-
-#ifdef CONFIG_X86_32
- if (start >= MAXMEM)
- return 0;
- if (start + size > MAXMEM)
- size = MAXMEM - start;
-#endif
#ifdef CONFIG_XEN
- if ((start >> PAGE_SHIFT) >= xen_start_info->nr_pages)
- return 0;
- if (PFN_UP(start + size) > xen_start_info->nr_pages)
- size = ((u64)xen_start_info->nr_pages << PAGE_SHIFT) - start;
-#endif
-
- addr = round_down(start + size - sizet, align);
- if (addr < start)
+ else
return 0;
-#ifdef CONFIG_XEN
max_initmap_pfn = ALIGN(PFN_UP(__pa(xen_start_info->pt_base))
+ xen_start_info->nr_pt_frames
+ 1 + (1 << (19 - PAGE_SHIFT)),
@@ -859,10 +830,6 @@ u64 __init early_reserve_e820(u64 startt
if (rc)
return 0;
#endif
- memblock_x86_reserve_range(addr, addr + sizet, "new next");
- e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
- printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
- update_e820_saved();
return addr;
}
@@ -1223,7 +1190,7 @@ void __init memblock_x86_fill(void)
* We are safe to enable resizing, beause memblock_x86_fill()
* is rather later for x86
*/
- memblock_can_resize = 1;
+ memblock_allow_resize();
for (i = 0; i < e820.nr_map; i++) {
struct e820entry *ei = &e820.map[i];
@@ -1238,22 +1205,42 @@ void __init memblock_x86_fill(void)
memblock_add(ei->addr, ei->size);
}
- memblock_analyze();
+#ifdef CONFIG_XEN
+ if (max_pfn > xen_start_info->nr_pages)
+ memblock_reserve(PFN_PHYS(xen_start_info->nr_pages),
+ PFN_PHYS(max_pfn - xen_start_info->nr_pages));
+#endif
+
memblock_dump_all();
}
void __init memblock_find_dma_reserve(void)
{
#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
- u64 free_size_pfn;
- u64 mem_size_pfn;
+ u64 nr_pages = 0, nr_free_pages = 0;
+ unsigned long start_pfn, end_pfn;
+ phys_addr_t start, end;
+ int i;
+ u64 u;
+
/*
* need to find out used area below MAX_DMA_PFN
* need to use memblock to get free size in [0, MAX_DMA_PFN]
* at first, and assume boot_mem will not take below MAX_DMA_PFN
*/
- mem_size_pfn = memblock_x86_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
- free_size_pfn = memblock_x86_free_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
- set_dma_reserve(mem_size_pfn - free_size_pfn);
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+ start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN);
+ end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN);
+ nr_pages += end_pfn - start_pfn;
+ }
+
+ for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) {
+ start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
+ end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
+ if (start_pfn < end_pfn)
+ nr_free_pages += end_pfn - start_pfn;
+ }
+
+ set_dma_reserve(nr_pages - nr_free_pages);
#endif
}
--- head.orig/arch/x86/kernel/early_printk-xen.c 2011-02-01 15:41:35.000000000 +0100
+++ head/arch/x86/kernel/early_printk-xen.c 2012-02-09 12:32:50.000000000 +0100
@@ -272,14 +272,14 @@ static int __init setup_early_printk(cha
if (!strncmp(buf, "xen", 3))
early_console_register(&xenboot_console, keep);
#endif
-#ifdef CONFIG_EARLY_PRINTK_MRST
+#ifdef CONFIG_EARLY_PRINTK_INTEL_MID
if (!strncmp(buf, "mrst", 4)) {
mrst_early_console_init();
early_console_register(&early_mrst_console, keep);
}
if (!strncmp(buf, "hsu", 3)) {
- hsu_early_console_init();
+ hsu_early_console_init(buf + 3);
early_console_register(&early_hsu_console, keep);
}
#endif
--- head.orig/arch/x86/kernel/entry_32-xen.S 2013-01-30 11:56:41.000000000 +0100
+++ head/arch/x86/kernel/entry_32-xen.S 2013-01-30 11:57:33.000000000 +0100
@@ -42,6 +42,7 @@
*/
#include <linux/linkage.h>
+#include <linux/err.h>
#include <asm/thread_info.h>
#include <asm/irqflags.h>
#include <asm/errno.h>
@@ -82,8 +83,6 @@
* enough to patch inline, increasing performance.
*/
-#define nr_syscalls ((syscall_table_size)/4)
-
/* Pseudo-eflags. */
NMI_MASK = 0x80000000
@@ -427,7 +426,7 @@ sysenter_past_esp:
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz sysenter_audit
sysenter_do_call:
- cmpl $(nr_syscalls), %eax
+ cmpl $(NR_syscalls), %eax
jae syscall_badsys
call *sys_call_table(,%eax,4)
movl %eax,PT_EAX(%esp)
@@ -459,7 +458,7 @@ sysenter_audit:
movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
movl %eax,%edx /* 2nd arg: syscall number */
movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
- call audit_syscall_entry
+ call __audit_syscall_entry
pushl_cfi %ebx
movl PT_EAX(%esp),%eax /* reload syscall number */
jmp sysenter_do_call
@@ -470,11 +469,10 @@ sysexit_audit:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY)
movl %eax,%edx /* second arg, syscall return value */
- cmpl $0,%eax /* is it < 0? */
- setl %al /* 1 if so, 0 if not */
+ cmpl $-MAX_ERRNO,%eax /* is it an error ? */
+ setbe %al /* 1 if so, 0 if not */
movzbl %al,%eax /* zero-extend that */
- inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
- call audit_syscall_exit
+ call __audit_syscall_exit
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
@@ -533,7 +531,7 @@ ENTRY(system_call)
# system call tracing in operation / emulation
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz syscall_trace_entry
- cmpl $(nr_syscalls), %eax
+ cmpl $(NR_syscalls), %eax
jae syscall_badsys
syscall_call:
call *sys_call_table(,%eax,4)
@@ -694,6 +692,8 @@ work_notifysig: # deal with pending s
movl %esp, %eax
jne work_notifysig_v86 # returning to kernel-space or
# vm86-space
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
xorl %edx, %edx
call do_notify_resume
jmp resume_userspace_sig
@@ -707,6 +707,8 @@ work_notifysig_v86:
#else
movl %esp, %eax
#endif
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
xorl %edx, %edx
call do_notify_resume
jmp resume_userspace_sig
@@ -719,7 +721,7 @@ syscall_trace_entry:
movl %esp, %eax
call syscall_trace_enter
/* What it returned is what we'll actually use. */
- cmpl $(nr_syscalls), %eax
+ cmpl $(NR_syscalls), %eax
jnae syscall_call
jmp syscall_exit
END(syscall_trace_entry)
@@ -759,29 +761,28 @@ END(syscall_badsys)
* System calls that need a pt_regs pointer.
*/
#define PTREGSCALL0(name) \
- ALIGN; \
-ptregs_##name: \
+ENTRY(ptregs_##name) ; \
leal 4(%esp),%eax; \
- jmp sys_##name;
+ jmp sys_##name; \
+ENDPROC(ptregs_##name)
#define PTREGSCALL1(name) \
- ALIGN; \
-ptregs_##name: \
+ENTRY(ptregs_##name) ; \
leal 4(%esp),%edx; \
movl (PT_EBX+4)(%esp),%eax; \
- jmp sys_##name;
+ jmp sys_##name; \
+ENDPROC(ptregs_##name)
#define PTREGSCALL2(name) \
- ALIGN; \
-ptregs_##name: \
+ENTRY(ptregs_##name) ; \
leal 4(%esp),%ecx; \
movl (PT_ECX+4)(%esp),%edx; \
movl (PT_EBX+4)(%esp),%eax; \
- jmp sys_##name;
+ jmp sys_##name; \
+ENDPROC(ptregs_##name)
#define PTREGSCALL3(name) \
- ALIGN; \
-ptregs_##name: \
+ENTRY(ptregs_##name) ; \
CFI_STARTPROC; \
leal 4(%esp),%eax; \
pushl_cfi %eax; \
@@ -806,8 +807,7 @@ PTREGSCALL2(vm86)
PTREGSCALL1(vm86old)
/* Clone is an oddball. The 4th arg is in %edi */
- ALIGN;
-ptregs_clone:
+ENTRY(ptregs_clone)
CFI_STARTPROC
leal 4(%esp),%eax
pushl_cfi %eax
@@ -1363,7 +1363,7 @@ ENTRY(ia32pv_cstar_target)
GET_THREAD_INFO(%ebp)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz cstar_trace_entry
- cmpl $nr_syscalls,%eax
+ cmpl $NR_syscalls,%eax
jae cstar_badsys
.Lcstar_call:
btl %eax,cstar_special
@@ -1378,7 +1378,7 @@ ENTRY(ia32pv_cstar_target)
movl PT_ECX(%esp),%ecx
movl %ecx,PT_EBP(%esp) # put user EBP back in place
jmp syscall_call
-cstar_set_tif:
+GLOBAL(cstar_set_tif)
movl $cstar_clear_tif,(%esp) # replace return address
LOCK_PREFIX
orl $_TIF_CSTAR,TI_flags(%ebp)
@@ -1390,7 +1390,7 @@ cstar_clear_tif:
jmp .Lcstar_exit
cstar_trace_entry:
movl $-ENOSYS,PT_EAX(%esp)
- cmpl $nr_syscalls,%eax
+ cmpl $NR_syscalls,%eax
jae 1f
btl %eax,cstar_special
jc .Lcstar_trace_special
@@ -1401,7 +1401,7 @@ cstar_trace_entry:
LOCK_PREFIX
andl $~_TIF_CSTAR,TI_flags(%ebp)
/* What it returned is what we'll actually use. */
- cmpl $nr_syscalls,%eax
+ cmpl $NR_syscalls,%eax
jb .Lcstar_call
jmp .Lcstar_exit
.Lcstar_trace_special:
@@ -1410,7 +1410,7 @@ cstar_trace_entry:
movl %ecx,PT_EBP(%esp) # put user EBP back in place
call syscall_trace_enter
/* What it returned is what we'll actually use. */
- cmpl $nr_syscalls,%eax
+ cmpl $NR_syscalls,%eax
jb syscall_call
jmp syscall_exit
cstar_badsys:
@@ -1438,19 +1438,14 @@ ENTRY(cstar_ret_from_fork)
jmp ret_from_fork
CFI_ENDPROC
END(cstar_ret_from_fork)
-#endif /* TIF_CSTAR */
-
-.section .rodata,"a"
-#include "syscall_table_32.S"
-syscall_table_size=(.-sys_call_table)
-
-#ifdef TIF_CSTAR
#include <asm/unistd.h>
+.pushsection .rodata,"a"
+.balign 4
cstar_special:
nr=0
mask=0
-.rept nr_syscalls+31
+.rept NR_syscalls+31
.irp n, __NR_sigreturn, __NR_rt_sigreturn
.if nr == \n
mask = mask | (1 << (\n & 31))
@@ -1462,15 +1457,7 @@ mask=0
mask = 0
.endif
.endr
-#define sys_call_table cstar_call_table
-#define ptregs_fork cstar_set_tif
-#define ptregs_clone cstar_set_tif
-#define ptregs_vfork cstar_set_tif
-#include "syscall_table_32.S"
-#undef sys_call_table
-#undef ptregs_fork
-#undef ptregs_clone
-#undef ptregs_vfork
+.popsection
#endif /* TIF_CSTAR */
/*
--- head.orig/arch/x86/kernel/entry_64-xen.S 2011-11-17 15:56:06.000000000 +0100
+++ head/arch/x86/kernel/entry_64-xen.S 2013-05-24 08:26:15.000000000 +0200
@@ -58,6 +58,7 @@
#include <asm/processor-flags.h>
#include <asm/ftrace.h>
#include <asm/percpu.h>
+#include <linux/err.h>
#include <xen/interface/xen.h>
#include <xen/interface/features.h>
@@ -212,7 +213,7 @@ NMI_MASK = 0x80000000
/*CFI_REL_OFFSET ss,0*/
pushq_cfi %rax /* rsp */
CFI_REL_OFFSET rsp,0
- pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */
+ pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */
/*CFI_REL_OFFSET rflags,0*/
pushq_cfi $__KERNEL_CS /* cs */
/*CFI_REL_OFFSET cs,0*/
@@ -448,8 +449,11 @@ ENTRY(ret_from_fork)
RESTORE_REST
testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
- je int_ret_from_sys_call
-
+ jnz 1f
+ /* Need to set the proper %ss (not NULL) for ring 3 iretq */
+ movl $__KERNEL_DS,SS-ARGOFFSET(%rsp)
+ jmp retint_restore_args
+1:
testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
jnz int_ret_from_sys_call
@@ -494,8 +498,7 @@ ENTRY(system_call)
INTR_FRAME start=2 offset=2*8
SAVE_ARGS -8,0
movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
- GET_THREAD_INFO(%rcx)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jnz tracesys
system_call_fastpath:
cmpq $__NR_syscall_max,%rax
@@ -512,10 +515,9 @@ ret_from_sys_call:
/* edi: flagmask */
sysret_check:
LOCKDEP_SYS_EXIT
- GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- movl TI_flags(%rcx),%edx
+ movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
andl %edi,%edx
jnz sysret_careful
CFI_REMEMBER_STATE
@@ -564,7 +566,7 @@ badsys:
#ifdef CONFIG_AUDITSYSCALL
/*
* Fast path for syscall audit without full syscall trace.
- * We just call audit_syscall_entry() directly, and then
+ * We just call __audit_syscall_entry() directly, and then
* jump back to the normal fast path.
*/
auditsys:
@@ -574,22 +576,21 @@ auditsys:
movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
movq %rax,%rsi /* 2nd arg: syscall number */
movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
- call audit_syscall_entry
+ call __audit_syscall_entry
LOAD_ARGS 0 /* reload call-clobbered registers */
jmp system_call_fastpath
/*
- * Return fast path for syscall audit. Call audit_syscall_exit()
+ * Return fast path for syscall audit. Call __audit_syscall_exit()
* directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
* masked off.
*/
sysret_audit:
movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */
- cmpq $0,%rsi /* is it < 0? */
- setl %al /* 1 if so, 0 if not */
+ cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */
+ setbe %al /* 1 if so, 0 if not */
movzbl %al,%edi /* zero-extend that into %edi */
- inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
- call audit_syscall_exit
+ call __audit_syscall_exit
movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
jmp sysret_check
#endif /* CONFIG_AUDITSYSCALL */
@@ -597,7 +598,7 @@ sysret_audit:
/* Do syscall tracing */
tracesys:
#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jz auditsys
#endif
SAVE_REST
@@ -626,12 +627,6 @@ tracesys:
GLOBAL(int_ret_from_sys_call)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testb $3,CS-ARGOFFSET(%rsp)
- jnz 1f
- /* Need to set the proper %ss (not NULL) for ring 3 iretq */
- movl $__KERNEL_DS,SS-ARGOFFSET(%rsp)
- jmp retint_restore_args # retrun from ring3 kernel
-1:
movl $_TIF_ALLWORK_MASK,%edi
/* edi: mask to check */
GLOBAL(int_with_check)
@@ -1236,13 +1231,28 @@ ENTRY(error_exit)
END(error_exit)
+#define extern #
+#include <asm-generic/percpu.h>
+
+.pushsection PER_CPU_BASE_SECTION, "aw", @progbits
+in_NMI: .byte 0
+.popsection
+
do_nmi_callback:
CFI_STARTPROC
addq $8, %rsp
CFI_ENDPROC
DEFAULT_FRAME
+ orb $1, PER_CPU_VAR(in_NMI)
+ js 1f
+0:
+ movb $0x80, PER_CPU_VAR(in_NMI)
call do_nmi
+ movl $0x80, %eax
+ cmpxchgb %ah, PER_CPU_VAR(in_NMI)
+ jne 0b
orl $NMI_MASK,EFLAGS(%rsp)
+1:
RESTORE_REST
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
--- head.orig/arch/x86/kernel/head-xen.c 2012-02-08 16:16:55.000000000 +0100
+++ head/arch/x86/kernel/head-xen.c 2013-04-05 09:22:12.000000000 +0200
@@ -54,7 +54,7 @@ void __init reserve_ebda_region(void)
lowmem = 0x9f000;
/* reserve all memory between lowmem and the 1MB mark */
- memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved");
+ memblock_reserve(lowmem, 0x100000 - lowmem);
}
#else /* CONFIG_XEN */
#include <linux/export.h>
@@ -106,11 +106,10 @@ void __init xen_start_kernel(void)
WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
VMASST_TYPE_writable_pagetables));
- memblock_init();
- memblock_x86_reserve_range(PAGE_ALIGN(__pa_symbol(&_end)),
- __pa(xen_start_info->pt_base)
- + PFN_PHYS(xen_start_info->nr_pt_frames),
- "Xen provided");
+ memblock_reserve(PAGE_ALIGN(__pa_symbol(&_end)),
+ __pa(xen_start_info->pt_base)
+ + PFN_PHYS(xen_start_info->nr_pt_frames)
+ - PAGE_ALIGN(__pa_symbol(&_end)));
x86_configure_nx();
--- head.orig/arch/x86/kernel/head32-xen.c 2011-07-01 15:19:34.000000000 +0200
+++ head/arch/x86/kernel/head32-xen.c 2012-02-09 12:32:50.000000000 +0100
@@ -47,9 +47,8 @@ void __init i386_start_kernel(void)
BUG_ON(pte_index(hypervisor_virt_start));
#endif
- memblock_init();
-
- memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
+ memblock_reserve(__pa_symbol(&_text),
+ __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
#ifndef CONFIG_XEN
#ifdef CONFIG_BLK_DEV_INITRD
@@ -59,7 +58,7 @@ void __init i386_start_kernel(void)
u64 ramdisk_image = boot_params.hdr.ramdisk_image;
u64 ramdisk_size = boot_params.hdr.ramdisk_size;
u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
- memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK");
+ memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
}
#endif
--- head.orig/arch/x86/kernel/head64-xen.c 2011-04-12 15:59:10.000000000 +0200
+++ head/arch/x86/kernel/head64-xen.c 2012-02-09 12:32:50.000000000 +0100
@@ -117,9 +117,8 @@ void __init x86_64_start_reservations(ch
{
copy_bootdata(__va(real_mode_data));
- memblock_init();
-
- memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
+ memblock_reserve(__pa_symbol(&_text),
+ __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
/*
* At this point everything still needed from the boot loader
--- head.orig/arch/x86/kernel/irq-xen.c 2011-11-17 15:56:06.000000000 +0100
+++ head/arch/x86/kernel/irq-xen.c 2013-05-24 10:37:34.000000000 +0200
@@ -78,6 +78,12 @@ int arch_show_interrupts(struct seq_file
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
seq_printf(p, " IRQ work interrupts\n");
+#ifndef CONFIG_XEN
+ seq_printf(p, "%*s: ", prec, "RTR");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count);
+ seq_printf(p, " APIC ICR read retries\n");
+#endif
#endif
#ifndef CONFIG_XEN
if (x86_platform_ipi_callback) {
@@ -149,6 +155,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
sum += irq_stats(cpu)->irq_spurious_count;
sum += irq_stats(cpu)->apic_perf_irqs;
sum += irq_stats(cpu)->apic_irq_work_irqs;
+ sum += irq_stats(cpu)->icr_read_retry_count;
#endif
#ifndef CONFIG_XEN
if (x86_platform_ipi_callback)
@@ -200,8 +207,8 @@ unsigned int __irq_entry do_IRQ(struct p
unsigned vector = ~regs->orig_ax;
unsigned irq;
- exit_idle();
irq_enter();
+ exit_idle();
irq = __this_cpu_read(vector_irq[vector]);
@@ -228,10 +235,10 @@ void smp_x86_platform_ipi(struct pt_regs
ack_APIC_irq();
- exit_idle();
-
irq_enter();
+ exit_idle();
+
inc_irq_stat(x86_platform_ipis);
if (x86_platform_ipi_callback)
--- head.orig/arch/x86/kernel/irq_64.c 2014-05-06 08:31:14.000000000 +0200
+++ head/arch/x86/kernel/irq_64.c 2012-05-11 10:57:43.000000000 +0200
@@ -39,7 +39,9 @@ static inline void stack_overflow_check(
{
#ifdef CONFIG_DEBUG_STACKOVERFLOW
#define STACK_TOP_MARGIN 128
+#ifndef CONFIG_X86_NO_TSS
struct orig_ist *oist;
+#endif
u64 irq_stack_top, irq_stack_bottom;
u64 estack_top, estack_bottom;
u64 curbase = (u64)task_stack_page(current);
@@ -58,11 +60,15 @@ static inline void stack_overflow_check(
if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom)
return;
+#ifndef CONFIG_X86_NO_TSS
oist = &__get_cpu_var(orig_ist);
estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN;
estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1];
if (regs->sp >= estack_top && regs->sp <= estack_bottom)
return;
+#else
+ estack_top = estack_bottom = 0;
+#endif
WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n",
current->comm, curbase, regs->sp,
--- head.orig/arch/x86/kernel/cpu/microcode/core-xen.c 2011-12-01 15:28:13.000000000 +0100
+++ head/arch/x86/kernel/cpu/microcode/core-xen.c 2012-02-09 14:22:00.000000000 +0100
@@ -186,16 +186,21 @@ static int request_microcode(const char
static int __init microcode_init(void)
{
const struct cpuinfo_x86 *c = &boot_cpu_data;
- char buf[32];
+ char buf[36];
const char *fw_name = buf;
int error;
if (c->x86_vendor == X86_VENDOR_INTEL)
snprintf(buf, sizeof(buf), "intel-ucode/%02x-%02x-%02x",
c->x86, c->x86_model, c->x86_mask);
- else if (c->x86_vendor == X86_VENDOR_AMD)
- fw_name = "amd-ucode/microcode_amd.bin";
- else {
+ else if (c->x86_vendor == X86_VENDOR_AMD) {
+ if (c->x86 >= 0x15)
+ snprintf(buf, sizeof(buf),
+ "amd-ucode/microcode_amd_fam%xh.bin",
+ c->x86);
+ else
+ fw_name = "amd-ucode/microcode_amd.bin";
+ } else {
pr_err("no support for this CPU vendor\n");
return -ENODEV;
}
--- head.orig/arch/x86/kernel/mpparse-xen.c 2011-12-21 11:56:23.000000000 +0100
+++ head/arch/x86/kernel/mpparse-xen.c 2012-02-09 12:32:50.000000000 +0100
@@ -591,9 +591,7 @@ void __init default_get_smp_config(unsig
#ifndef CONFIG_XEN
static void __init smp_reserve_memory(struct mpf_intel *mpf)
{
- unsigned long size = get_mpc_size(mpf->physptr);
-
- memblock_x86_reserve_range(mpf->physptr, mpf->physptr+size, "* MP-table mpc");
+ memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr));
}
#endif
@@ -626,7 +624,7 @@ static int __init smp_scan_config(unsign
mpf, (u64)virt_to_phys(mpf));
mem = virt_to_phys(mpf);
- memblock_x86_reserve_range(mem, mem + sizeof(*mpf), "* MP-table mpf");
+ memblock_reserve(mem, sizeof(*mpf));
if (mpf->physptr)
smp_reserve_memory(mpf);
#else
@@ -874,10 +872,8 @@ early_param("alloc_mptable", parse_alloc
void __init early_reserve_e820_mpc_new(void)
{
- if (enable_update_mptable && alloc_mptable) {
- u64 startt = 0;
- mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
- }
+ if (enable_update_mptable && alloc_mptable)
+ mpc_new_phys = early_reserve_e820(mpc_new_length, 4);
}
static int __init update_mp_table(void)
--- head.orig/arch/x86/kernel/pci-dma-xen.c 2012-04-04 14:32:31.000000000 +0200
+++ head/arch/x86/kernel/pci-dma-xen.c 2012-04-04 14:32:53.000000000 +0200
@@ -42,6 +42,15 @@ int iommu_detected __read_mostly = 0;
* guests and not for driver dma translation.
*/
int iommu_pass_through __read_mostly;
+
+/*
+ * Group multi-function PCI devices into a single device-group for the
+ * iommu_device_group interface. This tells the iommu driver to pretend
+ * it cannot distinguish between functions of a device, exposing only one
+ * group for the device. Useful for disallowing use of individual PCI
+ * functions from userspace drivers.
+ */
+int iommu_group_mf __read_mostly;
#endif
extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
@@ -233,6 +242,8 @@ static __init int iommu_setup(char *p)
#ifndef CONFIG_XEN
if (!strncmp(p, "pt", 2))
iommu_pass_through = 1;
+ if (!strncmp(p, "group_mf", 8))
+ iommu_group_mf = 1;
gart_parse_options(p);
#endif
--- head.orig/arch/x86/kernel/process-xen.c 2011-12-21 11:59:08.000000000 +0100
+++ head/arch/x86/kernel/process-xen.c 2012-02-09 12:32:50.000000000 +0100
@@ -280,7 +280,7 @@ int kernel_thread(int (*fn)(void *), voi
regs.orig_ax = -1;
regs.ip = (unsigned long) kernel_thread_helper;
regs.cs = __KERNEL_CS | get_kernel_rpl();
- regs.flags = X86_EFLAGS_IF | 0x2;
+ regs.flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
/* Ok, create the new process.. */
 	return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
--- head.orig/arch/x86/kernel/process_32-xen.c 2012-02-29 14:20:36.000000000 +0100
+++ head/arch/x86/kernel/process_32-xen.c 2012-08-10 10:47:29.000000000 +0200
@@ -102,7 +102,8 @@ void cpu_idle(void)
/* endless idle loop with no priority at all */
while (1) {
- tick_nohz_stop_sched_tick(1);
+ tick_nohz_idle_enter();
+ rcu_idle_enter();
while (!need_resched()) {
check_pgt_cache();
@@ -119,7 +120,8 @@ void cpu_idle(void)
xen_idle();
start_critical_timings();
}
- tick_nohz_restart_sched_tick();
+ rcu_idle_exit();
+ tick_nohz_idle_exit();
preempt_enable_no_resched();
schedule();
preempt_disable();
@@ -215,6 +217,7 @@ int copy_thread(unsigned long clone_flag
task_user_gs(p) = get_user_gs(regs);
+ p->fpu_counter = 0;
p->thread.io_bitmap_ptr = NULL;
tsk = current;
err = -ENOMEM;
@@ -303,11 +306,11 @@ __switch_to(struct task_struct *prev_p,
{
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
- int cpu = smp_processor_id(), cr0_ts = 0;
+ int cpu = smp_processor_id(), cr0_ts;
#ifndef CONFIG_X86_NO_TSS
struct tss_struct *tss = &per_cpu(init_tss, cpu);
#endif
- bool preload_fpu;
+ fpu_switch_t fpu;
#if CONFIG_XEN_COMPAT > 0x030002
struct physdev_set_iopl iopl_op;
struct physdev_set_iobitmap iobmp_op;
@@ -320,26 +323,7 @@ __switch_to(struct task_struct *prev_p,
/* XEN NOTE: FS/GS saved in switch_mm(), not here. */
- /*
- * If the task has used fpu the last 5 timeslices, just do a full
- * restore of the math state immediately to avoid the trap; the
- * chances of needing FPU soon are obviously high now
- */
- preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
-
- /*
- * This is basically '__unlazy_fpu', except that we queue a
- * multicall to indicate FPU task switch, rather than
- * synchronously trapping to Xen.
- */
- if (task_thread_info(prev_p)->status & TS_USEDFPU) {
- __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
- if (!preload_fpu) {
- mcl->op = __HYPERVISOR_fpu_taskswitch;
- mcl++->args[0] = 1;
- cr0_ts = 1;
- }
- }
+ fpu = xen_switch_fpu_prepare(prev_p, next_p, cpu, &mcl);
/*
* Reload sp0.
@@ -381,14 +365,6 @@ __switch_to(struct task_struct *prev_p,
mcl++;
}
- /* If we're going to preload the fpu context, make sure clts
- is run while we're batching the cpu state updates. */
- if (preload_fpu) {
- mcl->op = __HYPERVISOR_fpu_taskswitch;
- mcl++->args[0] = 0;
- cr0_ts = -1;
- }
-
if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
set_xen_guest_handle(iobmp_op.bitmap,
(char *)next->io_bitmap_ptr);
@@ -409,8 +385,11 @@ __switch_to(struct task_struct *prev_p,
BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo));
#endif
BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl));
- if (cr0_ts)
+ if (_mcl->op == __HYPERVISOR_fpu_taskswitch) {
percpu_write(xen_x86_cr0_upd, X86_CR0_TS);
+ cr0_ts = _mcl->args[0] ? 1 : -1;
+ } else
+ cr0_ts = 0;
if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL)))
BUG();
if (cr0_ts) {
@@ -421,10 +400,6 @@ __switch_to(struct task_struct *prev_p,
xen_clear_cr0_upd();
}
- /* we're going to use this soon, after a few expensive things */
- if (preload_fpu)
- prefetch(next->fpu.state);
-
/*
* Now maybe handle debug registers
*/
@@ -441,15 +416,14 @@ __switch_to(struct task_struct *prev_p,
*/
arch_end_context_switch(next_p);
- if (preload_fpu)
- __math_state_restore();
-
/*
* Restore %gs if needed (which is common)
*/
if (prev->gs | next->gs)
lazy_load_gs(next->gs);
+ switch_fpu_finish(next_p, fpu);
+
percpu_write(current_task, next_p);
return prev_p;
--- head.orig/arch/x86/kernel/process_64-xen.c 2011-11-17 15:56:06.000000000 +0100
+++ head/arch/x86/kernel/process_64-xen.c 2012-08-01 12:12:51.000000000 +0200
@@ -126,7 +126,7 @@ void cpu_idle(void)
/* endless idle loop with no priority at all */
while (1) {
- tick_nohz_stop_sched_tick(1);
+ tick_nohz_idle_enter();
while (!need_resched()) {
rmb();
@@ -143,8 +143,14 @@ void cpu_idle(void)
enter_idle();
/* Don't trace irqs off for idle */
stop_critical_timings();
+
+ /* enter_idle() needs rcu for notifiers */
+ rcu_idle_enter();
+
if (cpuidle_idle_call())
xen_idle();
+
+ rcu_idle_exit();
start_critical_timings();
/* In many cases the interrupt that ended idle
@@ -153,7 +159,7 @@ void cpu_idle(void)
__exit_idle();
}
- tick_nohz_restart_sched_tick();
+ tick_nohz_idle_exit();
preempt_enable_no_resched();
schedule();
preempt_disable();
@@ -289,6 +295,7 @@ int copy_thread(unsigned long clone_flag
set_tsk_thread_flag(p, TIF_FORK);
+ p->fpu_counter = 0;
p->thread.io_bitmap_ptr = NULL;
savesegment(gs, p->thread.gsindex);
@@ -302,13 +309,12 @@ int copy_thread(unsigned long clone_flag
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
- p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+ p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
+ IO_BITMAP_BYTES, GFP_KERNEL);
if (!p->thread.io_bitmap_ptr) {
p->thread.io_bitmap_max = 0;
return -ENOMEM;
}
- memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
- IO_BITMAP_BYTES);
set_tsk_thread_flag(p, TIF_IO_BITMAP);
}
@@ -392,7 +398,7 @@ __switch_to(struct task_struct *prev_p,
#ifndef CONFIG_X86_NO_TSS
struct tss_struct *tss = &per_cpu(init_tss, cpu);
#endif
- bool preload_fpu;
+ fpu_switch_t fpu;
#if CONFIG_XEN_COMPAT > 0x030002
struct physdev_set_iopl iopl_op;
struct physdev_set_iobitmap iobmp_op;
@@ -403,38 +409,7 @@ __switch_to(struct task_struct *prev_p,
#endif
multicall_entry_t _mcl[8], *mcl = _mcl;
- /*
- * If the task has used fpu the last 5 timeslices, just do a full
- * restore of the math state immediately to avoid the trap; the
- * chances of needing FPU soon are obviously high now
- */
- preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
-
- /* we're going to use this soon, after a few expensive things */
- if (preload_fpu)
- prefetch(next->fpu.state);
-
- /*
- * This is basically '__unlazy_fpu', except that we queue a
- * multicall to indicate FPU task switch, rather than
- * synchronously trapping to Xen.
- * The AMD workaround requires it to be after DS reload, or
- * after DS has been cleared, which we do in __prepare_arch_switch.
- */
- if (task_thread_info(prev_p)->status & TS_USEDFPU) {
- __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
- if (!preload_fpu) {
- mcl->op = __HYPERVISOR_fpu_taskswitch;
- mcl++->args[0] = 1;
- }
- } else
- prev_p->fpu_counter = 0;
-
- /* Make sure cpu is ready for new context */
- if (preload_fpu) {
- mcl->op = __HYPERVISOR_fpu_taskswitch;
- mcl++->args[0] = 0;
- }
+ fpu = xen_switch_fpu_prepare(prev_p, next_p, cpu, &mcl);
/*
* Reload sp0.
@@ -549,6 +524,8 @@ __switch_to(struct task_struct *prev_p,
if (next->gs)
WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs));
+ switch_fpu_finish(next_p, fpu);
+
/*
* Switch the PDA context.
*/
@@ -565,13 +542,6 @@ __switch_to(struct task_struct *prev_p,
task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
__switch_to_xtra(prev_p, next_p);
- /*
- * Preload the FPU context, now that we've determined that the
- * task is likely to be using it.
- */
- if (preload_fpu)
- __math_state_restore();
-
return prev_p;
}
--- head.orig/arch/x86/kernel/setup-xen.c 2013-12-06 15:08:16.000000000 +0100
+++ head/arch/x86/kernel/setup-xen.c 2013-12-06 15:08:23.000000000 +0100
@@ -341,7 +341,8 @@ static void __init cleanup_highmap(void)
static void __init reserve_brk(void)
{
if (_brk_end > _brk_start)
- memblock_x86_reserve_range(__pa(_brk_start), __pa(_brk_end), "BRK");
+ memblock_reserve(__pa(_brk_start),
+ __pa(_brk_end) - __pa(_brk_start));
/* Mark brk area as locked down and no longer taking any
new allocations */
@@ -367,13 +368,13 @@ static void __init relocate_initrd(void)
ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size,
PAGE_SIZE);
- if (ramdisk_here == MEMBLOCK_ERROR)
+ if (!ramdisk_here)
panic("Cannot find place for new RAMDISK of size %lld\n",
ramdisk_size);
/* Note: this includes all the lowmem currently occupied by
the initrd, we rely on that fact to keep the data intact. */
- memblock_x86_reserve_range(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK");
+ memblock_reserve(ramdisk_here, area_size);
initrd_start = ramdisk_here + PAGE_OFFSET;
initrd_end = initrd_start + ramdisk_size;
printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
@@ -446,7 +447,7 @@ static void __init reserve_initrd(void)
initrd_start = 0;
if (ramdisk_size >= (end_of_lowmem>>1)) {
- memblock_x86_free_range(ramdisk_image, ramdisk_end);
+ memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
printk(KERN_ERR "initrd too large to handle, "
"disabling initrd\n");
return;
@@ -472,7 +473,7 @@ static void __init reserve_initrd(void)
relocate_initrd();
- memblock_x86_free_range(ramdisk_image, ramdisk_end);
+ memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
}
#else
static void __init reserve_initrd(void)
@@ -551,15 +552,13 @@ static void __init memblock_x86_reserve_
#ifndef CONFIG_XEN
struct setup_data *data;
u64 pa_data;
- char buf[32];
if (boot_params.hdr.version < 0x0209)
return;
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
data = early_memremap(pa_data, sizeof(*data));
- sprintf(buf, "setup data %x", data->type);
- memblock_x86_reserve_range(pa_data, pa_data+sizeof(*data)+data->len, buf);
+ memblock_reserve(pa_data, sizeof(*data) + data->len);
pa_data = data->next;
early_iounmap(data, sizeof(*data));
}
@@ -616,7 +615,7 @@ static void __init reserve_crashkernel(v
crash_base = memblock_find_in_range(alignment,
CRASH_KERNEL_ADDR_MAX, crash_size, alignment);
- if (crash_base == MEMBLOCK_ERROR) {
+ if (!crash_base) {
pr_info("crashkernel reservation failed - No suitable area found.\n");
return;
}
@@ -630,7 +629,7 @@ static void __init reserve_crashkernel(v
return;
}
}
- memblock_x86_reserve_range(crash_base, crash_base + crash_size, "CRASH KERNEL");
+ memblock_reserve(crash_base, crash_size);
printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
"for crashkernel (System RAM: %ldMB)\n",
@@ -693,7 +692,7 @@ static __init void reserve_ibft_region(v
#ifndef CONFIG_XEN
if (size)
- memblock_x86_reserve_range(addr, addr + size, "* ibft");
+ memblock_reserve(addr, size);
#endif
}
@@ -841,12 +840,7 @@ void __init setup_arch(char **cmdline_p)
#endif
#ifdef CONFIG_EFI
if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
-#ifdef CONFIG_X86_32
- "EL32",
-#else
- "EL64",
-#endif
- 4)) {
+ EFI_LOADER_SIGNATURE, 4)) {
efi_enabled = 1;
efi_memblock_x86_reserve_range();
}
--- head.orig/arch/x86/kernel/smp-xen.c 2011-07-01 15:47:44.000000000 +0200
+++ head/arch/x86/kernel/smp-xen.c 2012-02-16 17:53:11.000000000 +0100
@@ -28,6 +28,7 @@
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/ipi.h>
+#include <asm/nmi.h>
#include <xen/evtchn.h>
/*
* Some notes on x86 processor bugs affecting SMP operation:
@@ -132,6 +133,20 @@ void xen_send_call_func_ipi(const struct
xen_send_IPI_mask_allbutself(mask, CALL_FUNCTION_VECTOR);
}
+static atomic_t stopping_cpu = ATOMIC_INIT(-1);
+static bool __read_mostly xen_smp_disable_nmi_ipi;
+
+static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
+{
+ /* We are registered on stopping cpu too, avoid spurious NMI */
+ if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
+ return NMI_HANDLED;
+
+ stop_this_cpu(NULL);
+
+ return NMI_HANDLED;
+}
+
/*
* this function calls the 'stop' function on all other CPUs in the system.
*/
@@ -158,7 +173,27 @@ void xen_stop_other_cpus(int wait)
* currently)
*/
if (num_online_cpus() > 1) {
- xen_send_IPI_allbutself(REBOOT_VECTOR);
+ unsigned int vector = REBOOT_VECTOR;
+
+ if (!xen_smp_disable_nmi_ipi) {
+ /* did someone beat us here? */
+ if (atomic_cmpxchg(&stopping_cpu, -1,
+ safe_smp_processor_id()) != -1)
+ return;
+
+ if (register_nmi_handler(NMI_LOCAL,
+ smp_stop_nmi_callback,
+ NMI_FLAG_FIRST, "smp_stop"))
+ /* Note: we ignore failures here */
+ return;
+
+ /* sync above data before sending NMI */
+ wmb();
+
+ vector = NMI_VECTOR;
+ }
+
+ xen_send_IPI_allbutself(vector);
/*
* Don't wait longer than a second if the caller
@@ -199,3 +234,11 @@ irqreturn_t smp_call_function_single_int
return IRQ_HANDLED;
}
+
+static int __init nonmi_ipi_setup(char *str)
+{
+ xen_smp_disable_nmi_ipi = true;
+ return 1;
+}
+
+__setup("nonmi_ipi", nonmi_ipi_setup);
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head/arch/x86/kernel/syscall_32-xen.c 2012-02-29 14:36:52.000000000 +0100
@@ -0,0 +1,20 @@
+#include "syscall_32.c"
+
+#include <linux/thread_info.h>
+
+#ifdef TIF_CSTAR
+extern asmlinkage void cstar_set_tif(void);
+
+#define ptregs_fork cstar_set_tif
+#define ptregs_clone cstar_set_tif
+#define ptregs_vfork cstar_set_tif
+
+const sys_call_ptr_t cstar_call_table[__NR_syscall_max+1] = {
+ /*
+ * Smells like a compiler bug -- it doesn't work
+ * when the & below is removed.
+ */
+ [0 ... __NR_syscall_max] = &sys_ni_syscall,
+#include <asm/syscalls_32.h>
+};
+#endif /* TIF_CSTAR */
--- head.orig/arch/x86/kernel/traps-xen.c 2011-11-17 16:50:15.000000000 +0100
+++ head/arch/x86/kernel/traps-xen.c 2013-11-07 12:36:39.000000000 +0100
@@ -310,19 +310,20 @@ dotraplinkage void __kprobes do_int3(str
== NOTIFY_STOP)
return;
#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
-#ifdef CONFIG_KPROBES
+
if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
== NOTIFY_STOP)
return;
-#else
- if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP)
- == NOTIFY_STOP)
- return;
-#endif
+ /*
+ * Let others (NMI) know that the debug stack is in use
+ * as we may switch to the interrupt stack.
+ */
+ debug_stack_usage_inc();
preempt_conditional_sti(regs);
do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
preempt_conditional_cli(regs);
+ debug_stack_usage_dec();
}
#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
@@ -415,6 +416,12 @@ dotraplinkage void __kprobes do_debug(st
SIGTRAP) == NOTIFY_STOP)
return;
+ /*
+ * Let others (NMI) know that the debug stack is in use
+ * as we may switch to the interrupt stack.
+ */
+ debug_stack_usage_inc();
+
/* It's safe to allow irq's after DR6 has been saved */
preempt_conditional_sti(regs);
@@ -422,6 +429,7 @@ dotraplinkage void __kprobes do_debug(st
handle_vm86_trap((struct kernel_vm86_regs *) regs,
error_code, 1);
preempt_conditional_cli(regs);
+ debug_stack_usage_dec();
return;
}
@@ -441,6 +449,7 @@ dotraplinkage void __kprobes do_debug(st
if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
send_sigtrap(tsk, regs, error_code, si_code);
preempt_conditional_cli(regs);
+ debug_stack_usage_dec();
return;
}
@@ -568,44 +577,19 @@ asmlinkage void __attribute__((weak)) sm
#endif /* CONFIG_XEN */
/*
- * __math_state_restore assumes that cr0.TS is already clear and the
- * fpu state is all ready for use. Used during context switch.
- */
-void __math_state_restore(void)
-{
- struct thread_info *thread = current_thread_info();
- struct task_struct *tsk = thread->task;
-
- /*
- * Paranoid restore. send a SIGSEGV if we fail to restore the state.
- */
- if (unlikely(restore_fpu_checking(tsk))) {
- stts();
- force_sig(SIGSEGV, tsk);
- return;
- }
-
- thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
- tsk->fpu_counter++;
-}
-
-/*
* 'math_state_restore()' saves the current math information in the
* old math state array, and gets the new ones from the current task
*
* Careful.. There are problems with IBM-designed IRQ13 behaviour.
* Don't touch unless you *really* know how it works.
*
- * Must be called with kernel preemption disabled (in this case,
- * local interrupts are disabled at the call-site in entry.S).
+ * Must be called with kernel preemption disabled (eg with local
+ * interrupts disabled as in the case of do_device_not_available).
*/
-asmlinkage void math_state_restore(void)
+static void _math_state_restore(void)
{
- struct thread_info *thread = current_thread_info();
- struct task_struct *tsk = thread->task;
+ struct task_struct *tsk = current;
- /* NB. 'clts' is done for us by Xen during virtual trap. */
- percpu_and(xen_x86_cr0, ~X86_CR0_TS);
if (!tsk_used_math(tsk)) {
stts();
local_irq_enable();
@@ -623,8 +607,23 @@ asmlinkage void math_state_restore(void)
clts();
}
- /* clts(); Allow maths ops (or we recurse) */
- __math_state_restore();
+ xen_thread_fpu_begin(tsk, NULL);
+ /*
+ * Paranoid restore. send a SIGSEGV if we fail to restore the state.
+ */
+ if (unlikely(restore_fpu_checking(tsk))) {
+ __thread_fpu_end(tsk);
+ force_sig(SIGSEGV, tsk);
+ return;
+ }
+
+ tsk->fpu_counter++;
+}
+
+void math_state_restore(void)
+{
+ clts();
+ _math_state_restore();
}
dotraplinkage void __kprobes
@@ -641,7 +640,9 @@ do_device_not_available(struct pt_regs *
return;
}
#endif
- math_state_restore(); /* interrupts still off */
+ /* NB. 'clts' is done for us by Xen during virtual trap. */
+ percpu_and(xen_x86_cr0, ~X86_CR0_TS);
+ _math_state_restore(); /* interrupts still off */
#ifdef CONFIG_X86_32
conditional_sti(regs);
#endif
--- head.orig/arch/x86/kernel/vsyscall_64-xen.c 2011-11-17 15:56:06.000000000 +0100
+++ head/arch/x86/kernel/vsyscall_64-xen.c 2013-05-23 17:48:42.000000000 +0200
@@ -57,7 +57,7 @@ DEFINE_VVAR(struct vsyscall_gtod_data, v
.lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
};
-static enum { EMULATE, NATIVE, NONE } vsyscall_mode = NATIVE;
+static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
static int __init vsyscall_setup(char *str)
{
@@ -140,11 +140,40 @@ static int addr_to_vsyscall_nr(unsigned
return nr;
}
+static bool write_ok_or_segv(unsigned long ptr, size_t size)
+{
+ /*
+ * XXX: if access_ok, get_user, and put_user handled
+ * sig_on_uaccess_error, this could go away.
+ */
+
+ if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
+ siginfo_t info;
+		struct thread_struct *thread = &current->thread;
+
+ thread->error_code = 6; /* user fault, no page, write */
+ thread->cr2 = ptr;
+ thread->trap_no = 14;
+
+ memset(&info, 0, sizeof(info));
+ info.si_signo = SIGSEGV;
+ info.si_errno = 0;
+ info.si_code = SEGV_MAPERR;
+ info.si_addr = (void __user *)ptr;
+
+ force_sig_info(SIGSEGV, &info, current);
+ return false;
+ } else {
+ return true;
+ }
+}
+
bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
{
struct task_struct *tsk;
unsigned long caller;
int vsyscall_nr;
+ int prev_sig_on_uaccess_error;
long ret;
/*
@@ -180,18 +209,43 @@ bool emulate_vsyscall(struct pt_regs *re
if (seccomp_mode(&tsk->seccomp))
do_exit(SIGKILL);
+ /*
+ * With a real vsyscall, page faults cause SIGSEGV. We want to
+ * preserve that behavior to make writing exploits harder.
+ */
+ prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
+ current_thread_info()->sig_on_uaccess_error = 1;
+
+ /*
+ * 0 is a valid user pointer (in the access_ok sense) on 32-bit and
+ * 64-bit, so we don't need to special-case it here. For all the
+ * vsyscalls, 0 means "don't write anything" not "write it at
+ * address 0".
+ */
+ ret = -EFAULT;
switch (vsyscall_nr) {
case 0:
+ if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
+ !write_ok_or_segv(regs->si, sizeof(struct timezone)))
+ break;
+
ret = sys_gettimeofday(
(struct timeval __user *)regs->di,
(struct timezone __user *)regs->si);
break;
case 1:
+ if (!write_ok_or_segv(regs->di, sizeof(time_t)))
+ break;
+
ret = sys_time((time_t __user *)regs->di);
break;
case 2:
+ if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
+ !write_ok_or_segv(regs->si, sizeof(unsigned)))
+ break;
+
ret = sys_getcpu((unsigned __user *)regs->di,
(unsigned __user *)regs->si,
0);
@@ -201,17 +255,22 @@ bool emulate_vsyscall(struct pt_regs *re
break;
}
+ current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
+
if (ret == -EFAULT) {
- /*
- * Bad news -- userspace fed a bad pointer to a vsyscall.
- *
- * With a real vsyscall, that would have caused SIGSEGV.
- * To make writing reliable exploits using the emulated
- * vsyscalls harder, generate SIGSEGV here as well.
- */
+ /* Bad news -- userspace fed a bad pointer to a vsyscall. */
warn_bad_vsyscall(KERN_INFO, regs,
"vsyscall fault (exploit attempt?)");
- goto sigsegv;
+
+ /*
+ * If we failed to generate a signal for any reason,
+ * generate one here. (This should be impossible.)
+ */
+ if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
+ !sigismember(&tsk->pending.signal, SIGSEGV)))
+ goto sigsegv;
+
+ return true; /* Don't emulate the ret. */
}
regs->ax = ret;
--- head.orig/arch/x86/mm/fault-xen.c 2011-11-17 15:56:06.000000000 +0100
+++ head/arch/x86/mm/fault-xen.c 2012-02-16 13:54:07.000000000 +0100
@@ -635,7 +635,7 @@ pgtable_bad(struct pt_regs *regs, unsign
static noinline void
no_context(struct pt_regs *regs, unsigned long error_code,
- unsigned long address)
+ unsigned long address, int signal, int si_code)
{
struct task_struct *tsk = current;
unsigned long *stackend;
@@ -643,8 +643,17 @@ no_context(struct pt_regs *regs, unsigne
int sig;
/* Are we prepared to handle this kernel fault? */
- if (fixup_exception(regs))
+ if (fixup_exception(regs)) {
+ if (current_thread_info()->sig_on_uaccess_error && signal) {
+ tsk->thread.trap_no = 14;
+ tsk->thread.error_code = error_code | PF_USER;
+ tsk->thread.cr2 = address;
+
+ /* XXX: hwpoison faults will set the wrong code. */
+ force_sig_info_fault(signal, si_code, address, tsk, 0);
+ }
return;
+ }
/*
* 32-bit:
@@ -673,7 +682,7 @@ no_context(struct pt_regs *regs, unsigne
stackend = end_of_stack(tsk);
if (tsk != &init_task && *stackend != STACK_END_MAGIC)
- printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
tsk->thread.cr2 = address;
tsk->thread.trap_no = 14;
@@ -684,7 +693,7 @@ no_context(struct pt_regs *regs, unsigne
sig = 0;
/* Executive summary in case the body of the oops scrolled away */
- printk(KERN_EMERG "CR2: %016lx\n", address);
+ printk(KERN_DEFAULT "CR2: %016lx\n", address);
oops_end(flags, regs, sig);
}
@@ -764,7 +773,7 @@ __bad_area_nosemaphore(struct pt_regs *r
if (is_f00f_bug(regs, address))
return;
- no_context(regs, error_code, address);
+ no_context(regs, error_code, address, SIGSEGV, si_code);
}
static noinline void
@@ -828,7 +837,7 @@ do_sigbus(struct pt_regs *regs, unsigned
/* Kernel mode? Handle exceptions or die: */
if (!(error_code & PF_USER)) {
- no_context(regs, error_code, address);
+ no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
return;
}
@@ -863,7 +872,7 @@ mm_fault_error(struct pt_regs *regs, uns
if (!(fault & VM_FAULT_RETRY))
 		up_read(&current->mm->mmap_sem);
if (!(error_code & PF_USER))
- no_context(regs, error_code, address);
+ no_context(regs, error_code, address, 0, 0);
return 1;
}
if (!(fault & VM_FAULT_ERROR))
@@ -873,7 +882,8 @@ mm_fault_error(struct pt_regs *regs, uns
/* Kernel mode? Handle exceptions or die: */
if (!(error_code & PF_USER)) {
 		up_read(&current->mm->mmap_sem);
- no_context(regs, error_code, address);
+ no_context(regs, error_code, address,
+ SIGSEGV, SEGV_MAPERR);
return 1;
}
--- head.orig/arch/x86/mm/init-xen.c 2013-04-05 09:16:42.000000000 +0200
+++ head/arch/x86/mm/init-xen.c 2013-08-15 13:02:31.000000000 +0200
@@ -16,6 +16,7 @@
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/proto.h>
+#include <asm/dma.h> /* for MAX_DMA_PFN */
unsigned long __meminitdata pgt_buf_start;
unsigned long __meminitdata pgt_buf_end;
@@ -80,10 +81,10 @@ static void __init find_early_table_spac
pgt_buf_end = pgt_buf_start;
} else {
/*
- * [table_start, table_top) gets passed to
- * memblock_x86_reserve_range(), so we must not use table_end
- * here, despite continuing to allocate from there. table_end
- * possibly being below table_start is otoh not a problem.
+ * [table_start, table_top) gets passed to memblock_reserve(),
+ * so we must not use table_end here, despite continuing to
+ * allocate from there. table_end possibly being below
+ * table_start is otoh not a problem.
*/
pgt_buf_start = pgt_buf_top;
}
@@ -99,7 +100,8 @@ static void __init find_early_table_spac
void __init xen_pagetable_reserve(u64 start, u64 end)
{
- memblock_x86_reserve_range(start, end, "PGTABLE");
+ if (end > start)
+ memblock_reserve(start, end - start);
}
struct map_range {
@@ -341,8 +343,8 @@ unsigned long __init_refok init_memory_m
* pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
* so that they can be reused for other purposes.
*
- * On native it just means calling memblock_x86_reserve_range, on Xen it
- * also means marking RW the pagetable pages that we allocated before
+ * On native it just means calling memblock_reserve, on Xen it also
+ * means marking RW the pagetable pages that we allocated before
* but that haven't been used.
*
* In fact on xen we mark RO the whole range pgt_buf_start -
@@ -468,3 +470,24 @@ void free_initrd_mem(unsigned long start
free_init_pages("initrd memory", start, PAGE_ALIGN(end));
}
#endif
+
+void __init zone_sizes_init(void)
+{
+ unsigned long max_zone_pfns[MAX_NR_ZONES];
+
+ memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+
+#ifdef CONFIG_ZONE_DMA
+ max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
+#endif
+#ifdef CONFIG_ZONE_DMA32
+ max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
+#endif
+ max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
+#ifdef CONFIG_HIGHMEM
+ max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
+#endif
+
+ free_area_init_nodes(max_zone_pfns);
+}
+
--- head.orig/arch/x86/mm/init_32-xen.c 2011-07-01 15:19:35.000000000 +0200
+++ head/arch/x86/mm/init_32-xen.c 2012-02-09 15:46:24.000000000 +0100
@@ -463,23 +463,17 @@ static void __init add_one_highpage_init
void __init add_highpages_with_active_regions(int nid,
unsigned long start_pfn, unsigned long end_pfn)
{
- struct range *range;
- int nr_range;
- int i;
-
- nr_range = __get_free_all_memory_range(&range, nid, start_pfn, end_pfn);
-
- for (i = 0; i < nr_range; i++) {
- struct page *page;
- int node_pfn;
-
- for (node_pfn = range[i].start; node_pfn < range[i].end;
- node_pfn++) {
- if (!pfn_valid(node_pfn))
- continue;
- page = pfn_to_page(node_pfn);
- add_one_highpage_init(page);
- }
+ phys_addr_t start, end;
+ u64 i;
+
+ for_each_free_mem_range(i, nid, &start, &end, NULL) {
+ unsigned long pfn = clamp_t(unsigned long, PFN_UP(start),
+ start_pfn, end_pfn);
+ unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end),
+ start_pfn, end_pfn);
+ for ( ; pfn < e_pfn; pfn++)
+ if (pfn_valid(pfn))
+ add_one_highpage_init(pfn_to_page(pfn));
}
}
#else
@@ -652,18 +646,18 @@ void __init initmem_init(void)
highstart_pfn = highend_pfn = max_pfn;
if (max_pfn > max_low_pfn)
highstart_pfn = max_low_pfn;
- memblock_x86_register_active_regions(0, 0, highend_pfn);
- sparse_memory_present_with_active_regions(0);
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
pages_to_mb(highend_pfn - highstart_pfn));
num_physpages = highend_pfn;
high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
- memblock_x86_register_active_regions(0, 0, max_low_pfn);
- sparse_memory_present_with_active_regions(0);
num_physpages = max_low_pfn;
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
+
+ memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
+ sparse_memory_present_with_active_regions(0);
+
#ifdef CONFIG_FLATMEM
max_mapnr = num_physpages;
#endif
@@ -676,30 +670,8 @@ void __init initmem_init(void)
}
#endif /* !CONFIG_NEED_MULTIPLE_NODES */
-static void __init zone_sizes_init(void)
-{
- unsigned long max_zone_pfns[MAX_NR_ZONES];
- memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-#ifdef CONFIG_ZONE_DMA
- max_zone_pfns[ZONE_DMA] =
- virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-#endif
- max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-#ifdef CONFIG_HIGHMEM
- max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
-#endif
-
- free_area_init_nodes(max_zone_pfns);
-}
-
void __init setup_bootmem_allocator(void)
{
-#ifdef CONFIG_XEN
- if (max_low_pfn > xen_start_info->nr_pages)
- memblock_x86_reserve_range(xen_start_info->nr_pages << PAGE_SHIFT,
- max_low_pfn << PAGE_SHIFT, "BALLOON");
-#endif
-
printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
max_pfn_mapped<<PAGE_SHIFT);
printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
@@ -753,8 +725,7 @@ unsigned long __init extend_init_mapping
}
if (start_pfn > start)
- memblock_x86_reserve_range(start << PAGE_SHIFT,
- start_pfn << PAGE_SHIFT, "INITMAP");
+ memblock_reserve(PFN_PHYS(start), PFN_PHYS(start_pfn - start));
return start_pfn;
}
@@ -821,6 +792,17 @@ void __init mem_init(void)
#ifdef CONFIG_FLATMEM
BUG_ON(!mem_map);
#endif
+ /*
+ * With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to
+ * be done before free_all_bootmem(). Memblock use free low memory for
+ * temporary data (see find_range_array()) and for this purpose can use
+ * pages that was already passed to the buddy allocator, hence marked as
+ * not accessible in the page tables when compiled with
+ * CONFIG_DEBUG_PAGEALLOC. Otherwise order of initialization is not
+ * important here.
+ */
+ set_highmem_pages_init();
+
/* this will put all low memory onto the freelists */
totalram_pages += free_all_bootmem();
/* XEN: init low-mem pages outside initial allocation. */
@@ -837,8 +819,6 @@ void __init mem_init(void)
if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
reservedpages++;
- set_highmem_pages_init();
-
codesize = (unsigned long) &_etext - (unsigned long) &_text;
datasize = (unsigned long) &_edata - (unsigned long) &_etext;
initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
--- head.orig/arch/x86/mm/init_64-xen.c 2013-04-05 09:18:42.000000000 +0200
+++ head/arch/x86/mm/init_64-xen.c 2013-04-05 09:21:15.000000000 +0200
@@ -457,7 +457,7 @@ static inline int __meminit make_readonl
* No need for writable mapping of kernel image. This also ensures that
* page and descriptor tables embedded inside don't have writable
* mappings. The range must be in sync with that passed to
- * memblock_x86_reserve_range() (as "TEXT DATA BSS"), since all other
+ * memblock_reserve() (covering kernel code and data), since all other
* regions can be allocated from under CONFIG_NO_BOOTMEM and thus must
* be writable.
*/
@@ -863,26 +863,12 @@ kernel_physical_mapping_init(unsigned lo
#ifndef CONFIG_NUMA
void __init initmem_init(void)
{
- memblock_x86_register_active_regions(0, 0, max_pfn);
-#ifdef CONFIG_XEN
- if (max_pfn > xen_start_info->nr_pages)
- memblock_x86_reserve_range(xen_start_info->nr_pages << PAGE_SHIFT,
- max_pfn << PAGE_SHIFT, "BALLOON");
-#endif
+ memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
}
#endif
void __init paging_init(void)
{
- unsigned long max_zone_pfns[MAX_NR_ZONES];
-
- memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-#ifdef CONFIG_ZONE_DMA
- max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
-#endif
- max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
- max_zone_pfns[ZONE_NORMAL] = max_pfn;
-
sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
@@ -894,7 +880,7 @@ void __init paging_init(void)
*/
node_clear_state(0, N_NORMAL_MEMORY);
- free_area_init_nodes(max_zone_pfns);
+ zone_sizes_init();
SetPagePinned(virt_to_page(init_mm.pgd));
}
--- head.orig/arch/x86/mm/pageattr-xen.c 2011-04-13 17:01:32.000000000 +0200
+++ head/arch/x86/mm/pageattr-xen.c 2012-02-09 12:32:50.000000000 +0100
@@ -1083,7 +1083,7 @@ out_err:
}
EXPORT_SYMBOL(set_memory_uc);
-int _set_memory_array(unsigned long *addr, int addrinarray,
+static int _set_memory_array(unsigned long *addr, int addrinarray,
unsigned long new_type)
{
int i, j;
@@ -1419,12 +1419,6 @@ void kernel_map_pages(struct page *page,
}
/*
- * If page allocator is not up yet then do not call c_p_a():
- */
- if (!debug_pagealloc_enabled)
- return;
-
- /*
* The return value is ignored as the calls cannot fail.
* Large pages for identity mappings are not used at boot time
* and hence no memory allocations during large page split.
--- head.orig/drivers/acpi/osl.c 2014-05-06 08:31:14.000000000 +0200
+++ head/drivers/acpi/osl.c 2013-12-03 08:27:53.000000000 +0100
@@ -324,8 +324,12 @@ acpi_map_lookup_virt(void __iomem *virt,
}
#ifndef CONFIG_IA64
+#ifndef CONFIG_XEN
#define should_use_kmap(pfn) page_is_ram(pfn)
#else
+#define should_use_kmap(mfn) pfn_valid(pfn = mfn_to_local_pfn(mfn))
+#endif
+#else
/* ioremap will take care of cache attributes */
#define should_use_kmap(pfn) 0
#endif
--- head.orig/drivers/hwmon/coretemp-xen.c 2011-11-17 16:53:49.000000000 +0100
+++ head/drivers/hwmon/coretemp-xen.c 2014-05-02 08:54:53.000000000 +0200
@@ -341,7 +341,7 @@ static int create_name_attr(struct platf
}
static int create_core_attrs(struct temp_data *tdata, struct device *dev,
- int attr_no)
+ int attr_no)
{
int err, i;
static ssize_t (*const rd_ptr[TOTAL_ATTRS]) (struct device *dev,
--- head.orig/drivers/pci/msi-xen.c 2011-11-17 15:56:06.000000000 +0100
+++ head/drivers/pci/msi-xen.c 2014-04-11 16:01:34.000000000 +0200
@@ -38,18 +38,21 @@ static int pci_seg_supported = 1;
static LIST_HEAD(msi_dev_head);
DEFINE_SPINLOCK(msi_dev_lock);
+struct msi_pirq_entry {
+ struct list_head list;
+ int pirq;
+ int entry_nr;
+ struct msi_dev_list *dev_entry;
+ struct kobject kobj;
+};
+
struct msi_dev_list {
struct pci_dev *dev;
- struct list_head list;
spinlock_t pirq_list_lock;
/* Store default pre-assigned irq */
unsigned int default_irq;
-};
-
-struct msi_pirq_entry {
- struct list_head list;
- int pirq;
- int entry_nr;
+ domid_t owner;
+ struct msi_pirq_entry e;
};
/* Arch hooks */
@@ -89,6 +92,21 @@ static void msix_set_enable(struct pci_d
}
}
+static int (*get_owner)(struct pci_dev *dev);
+
+static domid_t msi_get_dev_owner(struct pci_dev *dev)
+{
+ int owner;
+
+ if (is_initial_xendomain()
+ && get_owner && (owner = get_owner(dev)) >= 0) {
+ dev_info(&dev->dev, "get owner: %u\n", owner);
+ return owner;
+ }
+
+ return DOMID_SELF;
+}
+
static struct msi_dev_list *get_msi_dev_pirq_list(struct pci_dev *dev)
{
struct msi_dev_list *msi_dev_list, *ret = NULL;
@@ -96,12 +114,14 @@ static struct msi_dev_list *get_msi_dev_
spin_lock_irqsave(&msi_dev_lock, flags);
- list_for_each_entry(msi_dev_list, &msi_dev_head, list)
+ list_for_each_entry(msi_dev_list, &msi_dev_head, e.list)
if ( msi_dev_list->dev == dev )
ret = msi_dev_list;
if ( ret ) {
spin_unlock_irqrestore(&msi_dev_lock, flags);
+ if (ret->owner == DOMID_IO)
+ ret->owner = msi_get_dev_owner(dev);
return ret;
}
@@ -116,7 +136,10 @@ static struct msi_dev_list *get_msi_dev_
ret->dev = dev;
spin_lock_init(&ret->pirq_list_lock);
- list_add_tail(&ret->list, &msi_dev_head);
+ ret->owner = msi_get_dev_owner(dev);
+ ret->e.entry_nr = -1;
+ ret->e.dev_entry = ret;
+ list_add_tail(&ret->e.list, &msi_dev_head);
spin_unlock_irqrestore(&msi_dev_lock, flags);
return ret;
}
@@ -131,6 +154,8 @@ static int attach_pirq_entry(int pirq, i
return -ENOMEM;
entry->pirq = pirq;
entry->entry_nr = entry_nr;
+ entry->dev_entry = msi_dev_entry;
+ memset(&entry->kobj, 0, sizeof(entry->kobj));
spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
list_add_tail(&entry->list, &msi_dev_entry->dev->msi_list);
spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
@@ -154,11 +179,10 @@ static void detach_pirq_entry(int entry_
}
}
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
/*
* pciback will provide device's owner
*/
-static int (*get_owner)(struct pci_dev *dev);
-
int register_msi_get_owner(int (*func)(struct pci_dev *dev))
{
if (get_owner) {
@@ -178,26 +202,15 @@ int unregister_msi_get_owner(int (*func)
return 0;
}
EXPORT_SYMBOL(unregister_msi_get_owner);
+#endif
-static int msi_get_dev_owner(struct pci_dev *dev)
-{
- int owner;
-
- BUG_ON(!is_initial_xendomain());
- if (get_owner && (owner = get_owner(dev)) >= 0) {
- dev_info(&dev->dev, "get owner: %x \n", owner);
- return owner;
- }
-
- return DOMID_SELF;
-}
-
-static int msi_unmap_pirq(struct pci_dev *dev, int pirq)
+static int msi_unmap_pirq(struct pci_dev *dev, int pirq, domid_t owner,
+ struct kobject *kobj)
{
struct physdev_unmap_pirq unmap;
int rc;
- unmap.domid = msi_get_dev_owner(dev);
+ unmap.domid = owner;
/* See comments in msi_map_vector, input parameter pirq means
* irq number only if the device belongs to dom0 itself.
*/
@@ -210,6 +223,16 @@ static int msi_unmap_pirq(struct pci_dev
if (rc < 0)
return rc;
+	/*
+	 * It's possible that we get into this path when populate_msi_sysfs()
+ * Its possible that we get into this path when populate_msi_sysfs()
+ * fails, which means the entries were not registered with sysfs.
+ * In that case don't unregister them.
+ */
+ if (kobj->parent) {
+ kobject_del(kobj);
+ kobject_put(kobj);
+ }
+
if (unmap.domid == DOMID_SELF)
evtchn_map_pirq(pirq, 0);
@@ -235,13 +258,11 @@ static u64 find_table_base(struct pci_de
/*
* Protected by msi_lock
*/
-static int msi_map_vector(struct pci_dev *dev, int entry_nr, u64 table_base)
+static int msi_map_vector(struct pci_dev *dev, int entry_nr, u64 table_base,
+ domid_t domid)
{
struct physdev_map_pirq map_irq;
int rc = -EINVAL;
- domid_t domid = DOMID_SELF;
-
- domid = msi_get_dev_owner(dev);
map_irq.domid = domid;
map_irq.type = MAP_PIRQ_TYPE_MSI_SEG;
@@ -340,6 +361,142 @@ void pci_restore_msi_state(struct pci_de
}
EXPORT_SYMBOL_GPL(pci_restore_msi_state);
+
+#define to_msi_attr(obj) container_of(obj, struct msi_attribute, attr)
+#define to_pirq_entry(obj) container_of(obj, struct msi_pirq_entry, kobj)
+
+struct msi_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct msi_pirq_entry *, struct msi_attribute *,
+ char *buf);
+ ssize_t (*store)(struct msi_pirq_entry *, struct msi_attribute *,
+ const char *buf, size_t count);
+};
+
+static ssize_t show_msi_mode(struct msi_pirq_entry *entry,
+ struct msi_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%s\n", entry->entry_nr >= 0 ? "msix" : "msi");
+}
+
+static ssize_t show_xen_irq(struct msi_pirq_entry *entry,
+ struct msi_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", entry->dev_entry->owner == DOMID_SELF
+ ? evtchn_get_xen_pirq(entry->pirq)
+ : entry->pirq);
+}
+
+static ssize_t msi_irq_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct msi_attribute *attribute = to_msi_attr(attr);
+ struct msi_pirq_entry *entry = to_pirq_entry(kobj);
+
+ if (!attribute->show)
+ return -EIO;
+
+ return attribute->show(entry, attribute, buf);
+}
+
+static const struct sysfs_ops msi_irq_sysfs_ops = {
+ .show = msi_irq_attr_show,
+};
+
+static struct msi_attribute mode_attribute =
+ __ATTR(mode, S_IRUGO, show_msi_mode, NULL);
+
+static struct msi_attribute xen_irq_attribute =
+ __ATTR(xen_irq, S_IRUGO, show_xen_irq, NULL);
+
+static struct attribute *msi_irq_default_attrs[] = {
+ &mode_attribute.attr,
+ &xen_irq_attribute.attr,
+ NULL
+};
+
+static struct attribute *msi_pirq_default_attrs[] = {
+ &mode_attribute.attr,
+ NULL
+};
+
+static void msi_kobj_release(struct kobject *kobj)
+{
+ struct msi_dev_list *entry = to_pirq_entry(kobj)->dev_entry;
+
+ pci_dev_put(entry->dev);
+}
+
+static struct kobj_type msi_irq_ktype = {
+ .release = msi_kobj_release,
+ .sysfs_ops = &msi_irq_sysfs_ops,
+ .default_attrs = msi_irq_default_attrs,
+};
+
+static struct kobj_type msi_pirq_ktype = {
+ .release = msi_kobj_release,
+ .sysfs_ops = &msi_irq_sysfs_ops,
+ .default_attrs = msi_pirq_default_attrs,
+};
+
+static int populate_msi_sysfs(struct pci_dev *pdev)
+{
+ struct msi_dev_list *dev_entry = get_msi_dev_pirq_list(pdev);
+ domid_t owner = dev_entry->owner;
+ struct msi_pirq_entry *pirq_entry;
+ struct kobject *kobj;
+ int ret;
+ int count = 0;
+
+ pdev->msi_kset = kset_create_and_add("msi_irqs", NULL, &pdev->dev.kobj);
+ if (!pdev->msi_kset)
+ return -ENOMEM;
+
+ if (pdev->msi_enabled) {
+ kobj = &dev_entry->e.kobj;
+ kobj->kset = pdev->msi_kset;
+ pci_dev_get(pdev);
+ if (owner == DOMID_SELF)
+ ret = kobject_init_and_add(kobj, &msi_irq_ktype, NULL,
+ "%u", pdev->irq);
+ else
+ ret = kobject_init_and_add(kobj, &msi_pirq_ktype, NULL,
+ "xen-%u", pdev->irq);
+ if (ret)
+ pci_dev_put(pdev);
+ return ret;
+ }
+
+ list_for_each_entry(pirq_entry, &pdev->msi_list, list) {
+ kobj = &pirq_entry->kobj;
+ kobj->kset = pdev->msi_kset;
+ pci_dev_get(pdev);
+ if (owner == DOMID_SELF)
+ ret = kobject_init_and_add(kobj, &msi_irq_ktype, NULL,
+ "%u", pirq_entry->pirq);
+ else
+ ret = kobject_init_and_add(kobj, &msi_pirq_ktype, NULL,
+ "xen-%u", pirq_entry->pirq);
+ if (ret)
+ goto out_unroll;
+
+ count++;
+ }
+
+ return 0;
+
+out_unroll:
+ pci_dev_put(pdev);
+ list_for_each_entry(pirq_entry, &pdev->msi_list, list) {
+ if (!count)
+ break;
+ kobject_del(&pirq_entry->kobj);
+ kobject_put(&pirq_entry->kobj);
+ count--;
+ }
+ return ret;
+}
+
/**
* msi_capability_init - configure device's MSI capability structure
* @dev: pointer to the pci_dev data structure of MSI device function
@@ -353,6 +510,7 @@ EXPORT_SYMBOL_GPL(pci_restore_msi_state)
*/
static int msi_capability_init(struct pci_dev *dev, int nvec)
{
+ struct msi_dev_list *dev_entry = get_msi_dev_pirq_list(dev);
int pos, pirq;
u16 control;
@@ -361,7 +519,7 @@ static int msi_capability_init(struct pc
pci_read_config_word(dev, msi_control_reg(pos), &control);
- pirq = msi_map_vector(dev, 0, 0);
+ pirq = msi_map_vector(dev, 0, 0, dev_entry->owner);
if (pirq < 0)
return -EBUSY;
@@ -370,7 +528,8 @@ static int msi_capability_init(struct pc
msi_set_enable(dev, pos, 1);
dev->msi_enabled = 1;
- dev->irq = pirq;
+ dev->irq = dev_entry->e.pirq = pirq;
+ populate_msi_sysfs(dev);
return 0;
}
@@ -431,7 +590,8 @@ static int msix_capability_init(struct p
}
if (mapped)
continue;
- pirq = msi_map_vector(dev, entries[i].entry, table_base);
+ pirq = msi_map_vector(dev, entries[i].entry, table_base,
+ msi_dev_entry->owner);
if (pirq < 0)
break;
attach_pirq_entry(pirq, entries[i].entry, msi_dev_entry);
@@ -441,7 +601,12 @@ static int msix_capability_init(struct p
if (i != nvec) {
int avail = i - 1;
for (j = --i; j >= 0; j--) {
- msi_unmap_pirq(dev, entries[j].vector);
+ list_for_each_entry(pirq_entry, &dev->msi_list, list)
+ if (pirq_entry->entry_nr == entries[i].entry)
+ break;
+ msi_unmap_pirq(dev, entries[j].vector,
+ msi_dev_entry->owner,
+ &pirq_entry->kobj);
detach_pirq_entry(entries[j].entry, msi_dev_entry);
entries[j].vector = 0;
}
@@ -456,6 +621,7 @@ static int msix_capability_init(struct p
/* Set MSI-X enabled bits and unmask the function */
pci_intx_for_msi(dev, 0);
dev->msix_enabled = 1;
+ populate_msi_sysfs(dev);
control &= ~PCI_MSIX_FLAGS_MASKALL;
pci_write_config_word(dev, pos + PCI_MSIX_FLAGS, control);
@@ -554,7 +720,7 @@ int pci_enable_msi_block(struct pci_dev
dev->irq = evtchn_map_pirq(-1, dev->irq);
dev->msi_enabled = 1;
msi_dev_entry->default_irq = temp;
-
+ populate_msi_sysfs(dev);
return ret;
#else
return -EOPNOTSUPP;
@@ -599,7 +765,10 @@ void pci_msi_shutdown(struct pci_dev *de
pirq = dev->irq;
/* Restore dev->irq to its default pin-assertion vector */
dev->irq = msi_dev_entry->default_irq;
- msi_unmap_pirq(dev, pirq);
+ msi_unmap_pirq(dev, pirq, msi_dev_entry->owner,
+ &msi_dev_entry->e.kobj);
+ msi_dev_entry->owner = DOMID_IO;
+ memset(&msi_dev_entry->e.kobj, 0, sizeof(msi_dev_entry->e.kobj));
/* Disable MSI mode */
pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
@@ -611,6 +780,8 @@ void pci_msi_shutdown(struct pci_dev *de
void pci_disable_msi(struct pci_dev *dev)
{
pci_msi_shutdown(dev);
+ kset_unregister(dev->msi_kset);
+ dev->msi_kset = NULL;
}
EXPORT_SYMBOL(pci_disable_msi);
@@ -688,6 +859,7 @@ int pci_enable_msix(struct pci_dev *dev,
attach_pirq_entry(irq, entries[i].entry, msi_dev_entry);
entries[i].vector = irq;
}
+ populate_msi_sysfs(dev);
return 0;
#else
return -EOPNOTSUPP;
@@ -754,6 +926,8 @@ void pci_msix_shutdown(struct pci_dev *d
void pci_disable_msix(struct pci_dev *dev)
{
pci_msix_shutdown(dev);
+ kset_unregister(dev->msi_kset);
+ dev->msi_kset = NULL;
}
EXPORT_SYMBOL(pci_disable_msix);
@@ -768,25 +942,35 @@ EXPORT_SYMBOL(pci_disable_msix);
**/
void msi_remove_pci_irq_vectors(struct pci_dev *dev)
{
- unsigned long flags;
struct msi_dev_list *msi_dev_entry;
- struct msi_pirq_entry *pirq_entry, *tmp;
if (!pci_msi_enable || !dev)
return;
msi_dev_entry = get_msi_dev_pirq_list(dev);
- spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
- list_for_each_entry_safe(pirq_entry, tmp, &dev->msi_list, list) {
+ for (;;) {
+ struct msi_pirq_entry *pirq_entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
+ pirq_entry = list_first_entry_or_null(&dev->msi_list,
+ struct msi_pirq_entry,
+ list);
+ if (pirq_entry)
+ list_del(&pirq_entry->list);
+ spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
+ if (!pirq_entry)
+ break;
if (is_initial_xendomain())
- msi_unmap_pirq(dev, pirq_entry->pirq);
+ msi_unmap_pirq(dev, pirq_entry->pirq,
+ msi_dev_entry->owner,
+ &pirq_entry->kobj);
else
evtchn_map_pirq(pirq_entry->pirq, 0);
- list_del(&pirq_entry->list);
kfree(pirq_entry);
}
- spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
+ msi_dev_entry->owner = DOMID_IO;
dev->irq = msi_dev_entry->default_irq;
}
@@ -809,5 +993,21 @@ EXPORT_SYMBOL(pci_msi_enabled);
void pci_msi_init_pci_dev(struct pci_dev *dev)
{
+ int pos;
INIT_LIST_HEAD(&dev->msi_list);
+
+ /* Disable the msi hardware to avoid screaming interrupts
+ * during boot. This is the power on reset default so
+ * usually this should be a noop.
+ * But on a Xen host don't do this for IOMMUs which the hypervisor
+ * is in control of (and hence has already enabled on purpose).
+ */
+ if (is_initial_xendomain()
+ && (dev->class >> 8) == PCI_CLASS_SYSTEM_IOMMU
+ && dev->vendor == PCI_VENDOR_ID_AMD)
+ return;
+ pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
+ if (pos)
+ msi_set_enable(dev, pos, 0);
+ msix_set_enable(dev, 0);
}
--- head.orig/drivers/xen/Kconfig 2014-01-30 10:19:00.000000000 +0100
+++ head/drivers/xen/Kconfig 2014-01-30 10:22:49.000000000 +0100
@@ -22,10 +22,6 @@ config XEN_UNPRIVILEGED_GUEST
select PM
select SUSPEND
-config XEN_PRIVCMD
- def_bool y
- depends on PROC_FS
-
config XEN_XENBUS_DEV
def_bool y
depends on PROC_FS
@@ -602,7 +598,8 @@ endmenu
config XEN_PRIVCMD
tristate
- depends on XEN
+ depends on PARAVIRT_XEN || (XEN && PROC_FS)
+ default y if XEN
default m
config XEN_STUB
--- head.orig/drivers/xen/Makefile 2012-10-04 13:09:26.000000000 +0200
+++ head/drivers/xen/Makefile 2012-10-04 13:11:29.000000000 +0200
@@ -3,8 +3,10 @@ xen-biomerge-$(CONFIG_PARAVIRT_XEN) := b
xen-hotplug-$(CONFIG_PARAVIRT_XEN) := cpu_hotplug.o
xen-balloon_$(CONFIG_PARAVIRT_XEN) := xen-balloon.o
xen-evtchn-name-$(CONFIG_PARAVIRT_XEN) := xen-evtchn
+xen-privcmd_$(CONFIG_PARAVIRT_XEN) := xen-privcmd.o
xen-balloon_$(CONFIG_XEN) := balloon/
+xen-privcmd_$(CONFIG_XEN) := privcmd/
obj-$(CONFIG_XEN) += core/
obj-$(CONFIG_XEN) += console/
obj-y += xenbus/
@@ -38,10 +40,12 @@ obj-$(CONFIG_XEN_TMEM) += tmem.o
obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o
obj-$(CONFIG_XEN_DOM0) += pci.o
obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback/
+obj-$(CONFIG_XEN_PRIVCMD) += $(xen-privcmd_y)
xen-evtchn-y := evtchn.o
xen-gntdev-y := gntdev.o
xen-gntalloc-y := gntalloc.o
+xen-privcmd-y := privcmd.o
obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/
obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/
@@ -59,7 +63,6 @@ obj-$(CONFIG_XEN_SCSI_BACKEND) += scsib
obj-$(CONFIG_XEN_SCSI_FRONTEND) += scsifront/
obj-$(CONFIG_XEN_USB_BACKEND) += usbback/
obj-$(CONFIG_XEN_USB_FRONTEND) += usbfront/
-obj-$(CONFIG_XEN_PRIVCMD) += privcmd/
obj-$(CONFIG_XEN_GRANT_DEV) += gntdev/
obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_UTIL) += sfc_netutil/
obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_FRONTEND) += sfc_netfront/
--- head.orig/drivers/xen/balloon/balloon.c 2012-06-06 14:04:25.000000000 +0200
+++ head/drivers/xen/balloon/balloon.c 2014-01-22 14:17:38.000000000 +0100
@@ -73,11 +73,6 @@ static DEFINE_MUTEX(balloon_mutex);
*/
DEFINE_SPINLOCK(balloon_lock);
-#ifndef MODULE
-#include <linux/pagevec.h>
-static struct pagevec free_pagevec;
-#endif
-
struct balloon_stats balloon_stats;
/* We increase/decrease in batches which fit in a page */
@@ -198,27 +193,14 @@ static struct page *balloon_next_page(st
static inline void balloon_free_page(struct page *page)
{
#ifndef MODULE
- if (put_page_testzero(page) && !pagevec_add(&free_pagevec, page)) {
- __pagevec_free(&free_pagevec);
- pagevec_reinit(&free_pagevec);
- }
+ if (put_page_testzero(page))
+ free_hot_cold_page(page, 1);
#else
- /* pagevec interface is not being exported. */
+ /* free_hot_cold_page() is not being exported. */
__free_page(page);
#endif
}
-static inline void balloon_free_and_unlock(unsigned long flags)
-{
-#ifndef MODULE
- if (pagevec_count(&free_pagevec)) {
- __pagevec_free(&free_pagevec);
- pagevec_reinit(&free_pagevec);
- }
-#endif
- balloon_unlock(flags);
-}
-
static void balloon_alarm(unsigned long unused)
{
schedule_work(&balloon_worker);
@@ -330,7 +312,7 @@ static int increase_reservation(unsigned
totalram_pages = bs.current_pages - totalram_bias;
out:
- balloon_free_and_unlock(flags);
+ balloon_unlock(flags);
#ifndef MODULE
setup_per_zone_wmarks();
@@ -567,7 +549,6 @@ static int __init balloon_init(void)
IPRINTK("Initialising balloon driver.\n");
#ifdef CONFIG_XEN
- pagevec_init(&free_pagevec, true);
bs.current_pages = min(xen_start_info->nr_pages, max_pfn);
totalram_pages = bs.current_pages;
#else
@@ -720,7 +701,7 @@ struct page **alloc_empty_pages_and_page
if (ret != 0) {
balloon_free_page(page);
- balloon_free_and_unlock(flags);
+ balloon_unlock(flags);
goto err;
}
--- head.orig/drivers/xen/balloon/sysfs.c 2012-02-03 13:51:27.000000000 +0100
+++ head/drivers/xen/balloon/sysfs.c 2012-02-16 17:19:42.000000000 +0100
@@ -29,12 +29,11 @@
*/
#include <linux/capability.h>
+#include <linux/device.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/stat.h>
#include <linux/string.h>
-#include <linux/sysdev.h>
-#include <linux/module.h>
#include <xen/balloon.h>
#include "common.h"
@@ -45,27 +44,27 @@
#define BALLOON_CLASS_NAME "xen_memory"
#define BALLOON_SHOW(name, format, args...) \
- static ssize_t show_##name(struct sys_device *dev, \
- struct sysdev_attribute *attr, \
+ static ssize_t show_##name(struct device *dev, \
+ struct device_attribute *attr, \
char *buf) \
{ \
return sprintf(buf, format, ##args); \
} \
- static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL)
+ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(bs.current_pages));
BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(bs.balloon_low));
BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(bs.balloon_high));
BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(bs.driver_pages));
-static ssize_t show_target_kb(struct sys_device *dev,
- struct sysdev_attribute *attr, char *buf)
+static ssize_t show_target_kb(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
return sprintf(buf, "%lu\n", PAGES2KB(bs.target_pages));
}
-static ssize_t store_target_kb(struct sys_device *dev,
- struct sysdev_attribute *attr,
+static ssize_t store_target_kb(struct device *dev,
+ struct device_attribute *attr,
const char *buf, size_t count)
{
char *endchar;
@@ -83,19 +82,19 @@ static ssize_t store_target_kb(struct sy
return count;
}
-static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR,
+static DEVICE_ATTR(target_kb, S_IRUGO | S_IWUSR,
show_target_kb, store_target_kb);
-static ssize_t show_target(struct sys_device *dev,
- struct sysdev_attribute *attr, char *buf)
+static ssize_t show_target(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
return sprintf(buf, "%llu\n",
(unsigned long long)balloon_stats.target_pages
<< PAGE_SHIFT);
}
-static ssize_t store_target(struct sys_device *dev,
- struct sysdev_attribute *attr,
+static ssize_t store_target(struct device *dev,
+ struct device_attribute *attr,
const char *buf,
size_t count)
{
@@ -114,19 +113,19 @@ static ssize_t store_target(struct sys_d
return count;
}
-static SYSDEV_ATTR(target, S_IRUGO | S_IWUSR,
+static DEVICE_ATTR(target, S_IRUGO | S_IWUSR,
show_target, store_target);
-static struct sysdev_attribute *balloon_attrs[] = {
- &attr_target_kb,
- &attr_target,
+static struct device_attribute *balloon_attrs[] = {
+ &dev_attr_target_kb,
+ &dev_attr_target,
};
static struct attribute *balloon_info_attrs[] = {
- &attr_current_kb.attr,
- &attr_low_kb.attr,
- &attr_high_kb.attr,
- &attr_driver_kb.attr,
+ &dev_attr_current_kb.attr,
+ &dev_attr_low_kb.attr,
+ &dev_attr_high_kb.attr,
+ &dev_attr_driver_kb.attr,
NULL
};
@@ -135,36 +134,37 @@ static const struct attribute_group ball
.attrs = balloon_info_attrs,
};
-static struct sysdev_class balloon_sysdev_class = {
+static struct bus_type balloon_subsys = {
.name = BALLOON_CLASS_NAME,
+ .dev_name = BALLOON_CLASS_NAME,
};
-static struct sys_device balloon_sysdev;
+static struct device balloon_dev;
-static int __init register_balloon(struct sys_device *sysdev)
+static int __init register_balloon(struct device *dev)
{
int i, error;
- error = sysdev_class_register(&balloon_sysdev_class);
+ error = subsys_system_register(&balloon_subsys, NULL);
if (error)
return error;
- sysdev->id = 0;
- sysdev->cls = &balloon_sysdev_class;
+ dev->id = 0;
+ dev->bus = &balloon_subsys;
- error = sysdev_register(sysdev);
+ error = device_register(dev);
if (error) {
- sysdev_class_unregister(&balloon_sysdev_class);
+ bus_unregister(&balloon_subsys);
return error;
}
for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) {
- error = sysdev_create_file(sysdev, balloon_attrs[i]);
+ error = device_create_file(dev, balloon_attrs[i]);
if (error)
goto fail;
}
- error = sysfs_create_group(&sysdev->kobj, &balloon_info_group);
+ error = sysfs_create_group(&dev->kobj, &balloon_info_group);
if (error)
goto fail;
@@ -172,33 +172,33 @@ static int __init register_balloon(struc
fail:
while (--i >= 0)
- sysdev_remove_file(sysdev, balloon_attrs[i]);
- sysdev_unregister(sysdev);
- sysdev_class_unregister(&balloon_sysdev_class);
+ device_remove_file(dev, balloon_attrs[i]);
+ device_unregister(dev);
+ bus_unregister(&balloon_subsys);
return error;
}
-static __exit void unregister_balloon(struct sys_device *sysdev)
+static __exit void unregister_balloon(struct device *dev)
{
int i;
- sysfs_remove_group(&sysdev->kobj, &balloon_info_group);
+ sysfs_remove_group(&dev->kobj, &balloon_info_group);
for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++)
- sysdev_remove_file(sysdev, balloon_attrs[i]);
- sysdev_unregister(sysdev);
- sysdev_class_unregister(&balloon_sysdev_class);
+ device_remove_file(dev, balloon_attrs[i]);
+ device_unregister(dev);
+ bus_unregister(&balloon_subsys);
}
int __init balloon_sysfs_init(void)
{
- int rc = register_balloon(&balloon_sysdev);
+ int rc = register_balloon(&balloon_dev);
- register_xen_selfballooning(&balloon_sysdev);
+ register_xen_selfballooning(&balloon_dev);
return rc;
}
void __exit balloon_sysfs_exit(void)
{
- unregister_balloon(&balloon_sysdev);
+ unregister_balloon(&balloon_dev);
}
--- head.orig/drivers/xen/blkback/blkback.c 2013-06-20 15:28:23.000000000 +0200
+++ head/drivers/xen/blkback/blkback.c 2013-06-20 15:28:36.000000000 +0200
@@ -62,7 +62,7 @@ module_param_named(reqs, blkif_reqs, uin
MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
/* Run-time switchable: /sys/module/blkback/parameters/ */
-static int log_stats;
+static bool log_stats;
static unsigned int debug_lvl;
module_param(log_stats, bool, 0644);
module_param(debug_lvl, uint, 0644);
@@ -330,8 +330,11 @@ irqreturn_t blkif_be_int(int irq, void *
static void dispatch_discard(blkif_t *blkif, struct blkif_request_discard *req)
{
+ unsigned long secure = (blkif->vbd.discard_secure &&
+ (req->flag & BLKIF_DISCARD_SECURE)) ?
+ BLKDEV_DISCARD_SECURE : 0;
struct phys_req preq;
- int err = -EOPNOTSUPP, status;
+ int status;
blkif->st_ds_req++;
@@ -348,12 +351,8 @@ static void dispatch_discard(blkif_t *bl
return;
}
- if (blkif->blk_backend_type == BLKIF_BACKEND_PHY ||
- blkif->blk_backend_type == BLKIF_BACKEND_FILE)
- err = blkdev_issue_discard(preq.bdev, preq.sector_number,
- preq.nr_sects, GFP_KERNEL, 0);
-
- switch (err) {
+ switch (blkdev_issue_discard(preq.bdev, preq.sector_number,
+ preq.nr_sects, GFP_KERNEL, secure)) {
case 0:
status = BLKIF_RSP_OKAY;
break;
--- head.orig/drivers/xen/blkback/common.h 2013-06-20 15:28:24.000000000 +0200
+++ head/drivers/xen/blkback/common.h 2012-06-08 10:38:21.000000000 +0200
@@ -43,16 +43,12 @@
pr_debug("(file=%s, line=%d) " _f, \
__FILE__ , __LINE__ , ## _a )
-enum blkif_backend_type {
- BLKIF_BACKEND_PHY = 1,
- BLKIF_BACKEND_FILE = 2,
-};
-
struct vbd {
blkif_vdev_t handle; /* what the domain refers to this vbd as */
fmode_t mode; /* FMODE_xxx */
unsigned char type; /* VDISK_xxx */
bool flush_support;
+ bool discard_secure;
u32 pdevice; /* phys device that this vbd maps to */
struct block_device *bdev;
sector_t size; /* Cached size parameter */
@@ -68,7 +64,6 @@ typedef struct blkif_st {
unsigned int irq;
/* Comms information. */
enum blkif_protocol blk_protocol;
- enum blkif_backend_type blk_backend_type;
blkif_back_rings_t blk_rings;
struct vm_struct *blk_ring_area;
/* The VBD attached to this interface. */
--- head.orig/drivers/xen/blkback/vbd.c 2012-02-24 15:15:19.000000000 +0100
+++ head/drivers/xen/blkback/vbd.c 2012-02-27 10:22:57.000000000 +0100
@@ -92,6 +92,9 @@ int vbd_create(blkif_t *blkif, blkif_vde
if (q && q->flush_flags)
vbd->flush_support = true;
+ if (q && blk_queue_secdiscard(q))
+ vbd->discard_secure = true;
+
DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
handle, blkif->domid);
return 0;
--- head.orig/drivers/xen/blkback/xenbus.c 2012-12-18 12:11:36.000000000 +0100
+++ head/drivers/xen/blkback/xenbus.c 2012-12-18 12:11:42.000000000 +0100
@@ -228,43 +228,34 @@ static void blkback_discard(struct xenbu
struct backend_info *be)
{
struct xenbus_device *dev = be->dev;
- blkif_t *blkif = be->blkif;
- char *type = xenbus_read(XBT_NIL, dev->nodename, "type", NULL);
+ struct vbd *vbd = &be->blkif->vbd;
+ struct request_queue *q = bdev_get_queue(vbd->bdev);
int err, state = 0;
- if (!IS_ERR(type)) {
- if (strncmp(type, "file", 4) == 0) {
+ if (blk_queue_discard(q)) {
+ err = xenbus_printf(xbt, dev->nodename, "discard-granularity",
+ "%u", q->limits.discard_granularity);
+ if (!err)
state = 1;
- blkif->blk_backend_type = BLKIF_BACKEND_FILE;
+ else
+ xenbus_dev_error(dev, err,
+ "writing discard-granularity");
+ err = xenbus_printf(xbt, dev->nodename, "discard-alignment",
+ "%u", q->limits.discard_alignment);
+ if (err) {
+ xenbus_dev_error(dev, err,
+ "writing discard-alignment");
+ state = 0;
}
- if (strncmp(type, "phy", 3) == 0) {
- struct request_queue *q;
+ }
- q = bdev_get_queue(blkif->vbd.bdev);
- if (blk_queue_discard(q)) {
- blkif->blk_backend_type = BLKIF_BACKEND_PHY;
- err = xenbus_printf(xbt, dev->nodename,
- "discard-granularity", "%u",
- q->limits.discard_granularity);
- if (!err)
- state = 1;
- else
- xenbus_dev_error(dev, err,
- "writing discard-granularity");
- err = xenbus_printf(xbt, dev->nodename,
- "discard-alignment", "%u",
- q->limits.discard_alignment);
- if (err) {
- xenbus_dev_error(dev, err,
- "writing discard-alignment");
- state = 0;
- }
- }
- }
- kfree(type);
- } else
- xenbus_dev_error(dev, PTR_ERR(type),
- "reading type for discard");
+ /* Optional. */
+ if (state) {
+ err = xenbus_printf(xbt, dev->nodename, "discard-secure",
+ "%d", vbd->discard_secure);
+ if (err)
+ xenbus_dev_error(dev, err, "writing discard-secure");
+ }
err = xenbus_printf(xbt, dev->nodename, "feature-discard",
"%d", state);
--- head.orig/drivers/xen/blkfront/blkfront.c 2014-01-22 14:13:52.000000000 +0100
+++ head/drivers/xen/blkfront/blkfront.c 2014-01-22 14:16:56.000000000 +0100
@@ -332,6 +332,7 @@ static void blkfront_setup_discard(struc
{
unsigned int discard_granularity;
unsigned int discard_alignment;
+ int discard_secure;
info->feature_discard = 1;
if (!xenbus_gather(XBT_NIL, info->xbdev->otherend,
@@ -341,6 +342,10 @@ static void blkfront_setup_discard(struc
info->discard_granularity = discard_granularity;
info->discard_alignment = discard_alignment;
}
+ if (xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+ "discard-secure", "%d", &discard_secure) != 1)
+ discard_secure = 0;
+ info->feature_secdiscard = !!discard_secure;
}
/*
@@ -765,10 +770,13 @@ int blkif_ioctl(struct block_device *bd,
return scsi_cmd_ioctl(filep, info->rq,
info->gd, command,
(void __user *)argument);
-#else
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0)
return scsi_cmd_ioctl(info->rq, info->gd,
mode, command,
(void __user *)argument);
+#else
+ return scsi_cmd_blk_ioctl(bd, mode, command,
+ (void __user *)argument);
#endif
}
}
@@ -845,13 +853,15 @@ static int blkif_queue_request(struct re
#endif
ring_req->operation = info->flush_op;
- if (unlikely(req->cmd_flags & REQ_DISCARD)) {
+ if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) {
struct blkif_request_discard *discard = (void *)ring_req;
/* id, sector_number and handle are set above. */
discard->operation = BLKIF_OP_DISCARD;
discard->flag = 0;
discard->nr_sectors = blk_rq_sectors(req);
+ if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
+ discard->flag = BLKIF_DISCARD_SECURE;
} else {
ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg);
BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
@@ -1036,7 +1046,9 @@ static irqreturn_t blkif_int(int irq, vo
info->gd->disk_name);
ret = -EOPNOTSUPP;
info->feature_discard = 0;
+ info->feature_secdiscard = 0;
queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
+ queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
}
__blk_end_request_all(req, ret);
break;
@@ -1093,6 +1105,9 @@ static void blkif_free(struct blkfront_i
static void blkif_completion(struct blk_shadow *s)
{
int i;
+
+ if (s->req.operation == BLKIF_OP_DISCARD)
+ return;
for (i = 0; i < s->req.nr_segments; i++)
gnttab_end_foreign_access(s->req.seg[i].gref, 0UL);
}
--- head.orig/drivers/xen/blkfront/block.h 2013-05-31 13:38:17.000000000 +0200
+++ head/drivers/xen/blkfront/block.h 2013-05-31 13:38:29.000000000 +0200
@@ -109,7 +109,8 @@ struct blkfront_info
unsigned long shadow_free;
unsigned int feature_flush;
unsigned int flush_op;
- unsigned int feature_discard;
+ bool feature_discard;
+ bool feature_secdiscard;
unsigned int discard_granularity;
unsigned int discard_alignment;
int is_ready;
--- head.orig/drivers/xen/blkfront/vbd.c 2013-05-31 13:38:18.000000000 +0200
+++ head/drivers/xen/blkfront/vbd.c 2013-05-31 13:38:28.000000000 +0200
@@ -302,7 +302,7 @@ xlbd_reserve_minors(struct xlbd_major_in
if (end > ms->nr) {
unsigned long *bitmap, *old;
- bitmap = kzalloc(BITS_TO_LONGS(end) * sizeof(*bitmap),
+ bitmap = kcalloc(BITS_TO_LONGS(end), sizeof(*bitmap),
GFP_KERNEL);
if (bitmap == NULL)
return -ENOMEM;
@@ -371,6 +371,8 @@ xlvbd_init_blk_queue(struct gendisk *gd,
blk_queue_max_discard_sectors(rq, get_capacity(gd));
rq->limits.discard_granularity = info->discard_granularity;
rq->limits.discard_alignment = info->discard_alignment;
+ if (info->feature_secdiscard)
+ queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq);
}
/* Hard sector size and max sectors impersonate the equiv. hardware. */
--- head.orig/drivers/xen/blktap/blktap.c 2011-11-18 15:35:59.000000000 +0100
+++ head/drivers/xen/blktap/blktap.c 2013-01-25 15:09:14.000000000 +0100
@@ -131,7 +131,7 @@ static struct tap_blkif *tapfds[MAX_TAP_
static int blktap_next_minor;
/* Run-time switchable: /sys/module/blktap/parameters/ */
-static int log_stats;
+static bool log_stats;
static unsigned int debug_lvl;
module_param(log_stats, bool, 0644);
module_param(debug_lvl, uint, 0644);
@@ -277,7 +277,7 @@ static inline unsigned int OFFSET_TO_SEG
} while(0)
-static char *blktap_devnode(struct device *dev, mode_t *mode)
+static char *blktap_devnode(struct device *dev, umode_t *mode)
{
return kasprintf(GFP_KERNEL, "xen/blktap%u", MINOR(dev->devt));
}
--- head.orig/drivers/xen/blktap2-new/device.c 2011-11-21 15:50:27.000000000 +0100
+++ head/drivers/xen/blktap2-new/device.c 2012-02-17 11:29:41.000000000 +0100
@@ -425,7 +425,7 @@ blktap_device_destroy_sync(struct blktap
!blktap_device_try_destroy(tap));
}
-static char *blktap_devnode(struct gendisk *gd, mode_t *mode)
+static char *blktap_devnode(struct gendisk *gd, umode_t *mode)
{
return kasprintf(GFP_KERNEL, BLKTAP2_DEV_DIR "tapdev%u",
gd->first_minor);
--- head.orig/drivers/xen/blktap2-new/sysfs.c 2011-02-24 15:02:50.000000000 +0100
+++ head/drivers/xen/blktap2-new/sysfs.c 2012-02-17 11:29:46.000000000 +0100
@@ -262,7 +262,7 @@ blktap_sysfs_show_devices(struct class *
}
static CLASS_ATTR(devices, S_IRUGO, blktap_sysfs_show_devices, NULL);
-static char *blktap_devnode(struct device *dev, mode_t *mode)
+static char *blktap_devnode(struct device *dev, umode_t *mode)
{
return kasprintf(GFP_KERNEL, BLKTAP2_DEV_DIR "blktap%u",
MINOR(dev->devt));
--- head.orig/drivers/xen/blktap2/device.c 2012-02-16 13:44:17.000000000 +0100
+++ head/drivers/xen/blktap2/device.c 2012-02-17 11:29:27.000000000 +0100
@@ -1068,7 +1068,7 @@ blktap_device_destroy(struct blktap *tap
return 0;
}
-static char *blktap_devnode(struct gendisk *gd, mode_t *mode)
+static char *blktap_devnode(struct gendisk *gd, umode_t *mode)
{
return kasprintf(GFP_KERNEL, BLKTAP2_DEV_DIR "tapdev%u",
gd->first_minor);
--- head.orig/drivers/xen/blktap2/sysfs.c 2011-02-24 14:59:15.000000000 +0100
+++ head/drivers/xen/blktap2/sysfs.c 2012-02-17 11:29:32.000000000 +0100
@@ -439,7 +439,7 @@ blktap_sysfs_free(void)
class_destroy(class);
}
-static char *blktap_devnode(struct device *dev, mode_t *mode)
+static char *blktap_devnode(struct device *dev, umode_t *mode)
{
return kasprintf(GFP_KERNEL, BLKTAP2_DEV_DIR "blktap%u",
MINOR(dev->devt));
--- head.orig/drivers/xen/core/cpu_hotplug.c 2011-02-01 14:42:26.000000000 +0100
+++ head/drivers/xen/core/cpu_hotplug.c 2013-01-30 11:57:29.000000000 +0100
@@ -25,7 +25,7 @@ static int local_cpu_hotplug_request(voi
return (current->mm != NULL);
}
-static void vcpu_hotplug(unsigned int cpu, struct sys_device *dev)
+static void vcpu_hotplug(unsigned int cpu, struct device *dev)
{
int err;
char dir[16], state[16];
@@ -63,7 +63,7 @@ static void handle_vcpu_hotplug_event(
if ((cpustr = strstr(node, "cpu/")) != NULL) {
sscanf(cpustr, "cpu/%u", &cpu);
- vcpu_hotplug(cpu, get_cpu_sysdev(cpu));
+ vcpu_hotplug(cpu, get_cpu_device(cpu));
}
}
@@ -96,7 +96,7 @@ static int setup_cpu_watcher(struct noti
if (!is_initial_xendomain()) {
for_each_possible_cpu(i)
- vcpu_hotplug(i, get_cpu_sysdev(i));
+ vcpu_hotplug(i, get_cpu_device(i));
pr_info("Brought up %ld CPUs\n", (long)num_online_cpus());
}
--- head.orig/drivers/xen/core/evtchn.c 2011-11-21 15:49:38.000000000 +0100
+++ head/drivers/xen/core/evtchn.c 2012-10-04 13:11:41.000000000 +0200
@@ -329,8 +329,8 @@ asmlinkage void __irq_entry evtchn_do_up
old_regs = set_irq_regs(regs);
xen_spin_irq_enter();
- exit_idle();
irq_enter();
+ exit_idle();
do {
vcpu_info->evtchn_upcall_pending = 0;
--- head.orig/drivers/xen/core/smpboot.c 2011-11-18 15:44:14.000000000 +0100
+++ head/drivers/xen/core/smpboot.c 2012-03-22 16:22:50.000000000 +0100
@@ -443,6 +443,7 @@ void __ref play_dead(void)
void __init smp_cpus_done(unsigned int max_cpus)
{
+ nmi_selftest();
}
#ifndef CONFIG_X86_LOCAL_APIC
--- head.orig/drivers/xen/core/spinlock.c 2014-03-11 10:27:46.000000000 +0100
+++ head/drivers/xen/core/spinlock.c 2014-01-07 17:17:01.000000000 +0100
@@ -37,7 +37,7 @@ struct rm_seq {
};
static DEFINE_PER_CPU(struct rm_seq, rm_seq);
-static int __read_mostly nopoll;
+static bool __read_mostly nopoll;
module_param(nopoll, bool, 0);
int __cpuinit xen_spinlock_init(unsigned int cpu)
@@ -139,9 +139,7 @@ static unsigned int ticket_drop(struct s
if (cmpxchg(&spinning->ticket, ticket, -1) != ticket)
return -1;
- asm volatile(UNLOCK_LOCK_PREFIX "inc" UNLOCK_SUFFIX(0) " %0"
- : "+m" (lock->tickets.head)
- : : "memory", "cc");
+ __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
ticket = (__ticket_t)(ticket + 1);
return ticket != lock->tickets.tail ? ticket : -1;
}
--- head.orig/drivers/xen/netback/interface.c 2013-10-29 10:17:45.000000000 +0100
+++ head/drivers/xen/netback/interface.c 2013-02-20 11:37:02.000000000 +0100
@@ -99,7 +99,8 @@ static int netbk_change_mtu(struct net_d
return 0;
}
-static u32 netbk_fix_features(struct net_device *dev, u32 features)
+static netdev_features_t netbk_fix_features(struct net_device *dev,
+ netdev_features_t features)
{
netif_t *netif = netdev_priv(dev);
--- head.orig/drivers/xen/netback/netback.c 2013-07-04 13:24:10.000000000 +0200
+++ head/drivers/xen/netback/netback.c 2013-07-04 13:24:14.000000000 +0200
@@ -51,6 +51,12 @@ struct netbk_rx_meta {
u8 copy:1;
};
+struct netbk_tx_cb {
+ u16 copy_slots;
+ u16 pending_idx[1 + XEN_NETIF_NR_SLOTS_MIN];
+};
+#define netbk_tx_cb(skb) ((struct netbk_tx_cb *)skb->cb)
+
struct netbk_tx_pending_inuse {
struct list_head list;
unsigned long alloc_time;
@@ -155,6 +161,8 @@ static struct sk_buff_head tx_queue;
static grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
static gnttab_unmap_grant_ref_t tx_unmap_ops[MAX_PENDING_REQS];
static gnttab_map_grant_ref_t tx_map_ops[MAX_PENDING_REQS];
+static gnttab_copy_t tx_copy_ops[2 * MAX_PENDING_REQS];
+static netif_tx_request_t tx_slots[XEN_NETIF_NR_SLOTS_MIN];
static struct list_head net_schedule_list;
static spinlock_t net_schedule_list_lock;
@@ -163,11 +171,19 @@ static spinlock_t net_schedule_list_lock
static unsigned long mfn_list[MAX_MFN_ALLOC];
static unsigned int alloc_index = 0;
+/*
+ * This is the maximum slots a TX request can have. If a guest sends a TX
+ * request which exceeds this limit it is considered malicious.
+ */
+static unsigned int max_tx_slots = XEN_NETIF_NR_SLOTS_MIN;
+module_param(max_tx_slots, uint, 0444);
+MODULE_PARM_DESC(max_tx_slots, "Maximum number of slots accepted in netfront TX requests");
+
/* Setting this allows the safe use of this driver without netloop. */
-static int MODPARM_copy_skb = 1;
+static bool MODPARM_copy_skb = true;
module_param_named(copy_skb, MODPARM_copy_skb, bool, 0);
MODULE_PARM_DESC(copy_skb, "Copy data received from netfront without netloop");
-static int MODPARM_permute_returns = 0;
+static bool MODPARM_permute_returns;
module_param_named(permute_returns, MODPARM_permute_returns, bool, S_IRUSR|S_IWUSR);
MODULE_PARM_DESC(permute_returns, "Randomly permute the order in which TX responses are sent to the frontend");
@@ -1050,26 +1066,48 @@ static int netbk_count_requests(netif_t
netif_tx_request_t *txp, int work_to_do)
{
RING_IDX cons = netif->tx.req_cons;
- int frags = 0, drop_err = 0;
+ int slots = 0, drop_err = 0;
if (!(first->flags & XEN_NETTXF_more_data))
return 0;
do {
- if (frags >= work_to_do) {
- netdev_err(netif->dev, "Need more frags\n");
+ if (slots >= work_to_do) {
+ netdev_err(netif->dev, "Need more slots\n");
netbk_fatal_tx_err(netif);
return -ENODATA;
}
- if (unlikely(frags >= MAX_SKB_FRAGS)) {
- netdev_err(netif->dev, "Too many frags\n");
+ if (unlikely(slots >= max_tx_slots)) {
+ netdev_err(netif->dev, "Too many slots\n");
netbk_fatal_tx_err(netif);
return -E2BIG;
}
- memcpy(txp, RING_GET_REQUEST(&netif->tx, cons + frags),
- sizeof(*txp));
+ /*
+ * The Xen network protocol had an implicit dependency on
+ * MAX_SKB_FRAGS. XEN_NETIF_NR_SLOTS_MIN is set to the
+ * historical MAX_SKB_FRAGS value 18 to honor the same
+ * behavior as before. Any packet using more than 18 slots
+ * but less than max_tx_slots slots is dropped.
+ */
+ switch (slots) {
+ case 0 ... XEN_NETIF_NR_SLOTS_MIN - 1:
+ break;
+ case XEN_NETIF_NR_SLOTS_MIN:
+ if (net_ratelimit())
+ netdev_dbg(netif->dev,
+ "slot count exceeding limit of %d, dropping packet\n",
+ XEN_NETIF_NR_SLOTS_MIN);
+ if (!drop_err)
+ drop_err = -E2BIG;
+ /* fall through */
+ default:
+ --txp;
+ break;
+ }
+
+ *txp = *RING_GET_REQUEST(&netif->tx, cons + slots);
/*
* If the guest submitted a frame >= 64 KiB then first->size
@@ -1088,7 +1126,7 @@ static int netbk_count_requests(netif_t
}
first->size -= txp->size;
- frags++;
+ slots++;
if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
netdev_err(netif->dev, "txp->offset: %x, size: %u\n",
@@ -1099,30 +1137,77 @@ static int netbk_count_requests(netif_t
} while ((txp++)->flags & XEN_NETTXF_more_data);
if (drop_err) {
- netbk_tx_err(netif, first, cons + frags);
+ netbk_tx_err(netif, first, cons + slots);
return drop_err;
}
- return frags;
+ return slots;
+}
+
+struct netbk_tx_gop {
+ gnttab_map_grant_ref_t *map;
+ gnttab_copy_t *copy;
+ void *ptr;
+};
+
+static void netbk_fill_tx_copy(const netif_tx_request_t *txreq,
+ struct netbk_tx_gop *gop, domid_t domid)
+{
+ gop->copy--;
+ gop->copy->source.u.ref = txreq->gref;
+ gop->copy->source.domid = domid;
+ gop->copy->source.offset = txreq->offset;
+ gop->copy->dest.u.gmfn = virt_to_mfn(gop->ptr);
+ gop->copy->dest.domid = DOMID_SELF;
+ gop->copy->dest.offset = offset_in_page(gop->ptr);
+ gop->copy->flags = GNTCOPY_source_gref;
+
+ if (gop->copy->dest.offset + txreq->size > PAGE_SIZE) {
+ unsigned int first = PAGE_SIZE - gop->copy->dest.offset;
+
+ gop->copy->len = first;
+ gop->ptr += first;
+
+ gop->copy--;
+ gop->copy->source = gop->copy[-1].source;
+ gop->copy->source.offset += first;
+ gop->copy->dest.u.gmfn = virt_to_mfn(gop->ptr);
+ gop->copy->dest.domid = DOMID_SELF;
+ gop->copy->dest.offset = 0;
+ gop->copy->flags = GNTCOPY_source_gref;
+ gop->copy->len = txreq->size - first;
+ } else
+ gop->copy->len = txreq->size;
+
+ gop->ptr += gop->copy->len;
}
-static gnttab_map_grant_ref_t *netbk_get_requests(netif_t *netif,
- struct sk_buff *skb,
- netif_tx_request_t *txp,
- gnttab_map_grant_ref_t *mop)
+void netbk_get_requests(netif_t *netif, struct sk_buff *skb,
+ netif_tx_request_t *txp, struct netbk_tx_gop *gop)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
skb_frag_t *frags = shinfo->frags;
- u16 pending_idx = *(u16 *)skb->data;
+ u16 pending_idx = netbk_tx_cb(skb)->pending_idx[0];
int i, start;
/* Skip first skb fragment if it is on same page as header fragment. */
start = (frag_get_pending_idx(frags) == pending_idx);
+ for (i = 0; i < netbk_tx_cb(skb)->copy_slots; ++i, txp++) {
+ pending_idx = pending_ring[MASK_PEND_IDX(pending_cons++)];
+
+ netbk_fill_tx_copy(txp, gop, netif->domid);
+
+ pending_tx_info[pending_idx].req = *txp;
+ netif_get(netif);
+ pending_tx_info[pending_idx].netif = netif;
+ netbk_tx_cb(skb)->pending_idx[1 + i] = pending_idx;
+ }
+
for (i = start; i < shinfo->nr_frags; i++, txp++) {
pending_idx = pending_ring[MASK_PEND_IDX(pending_cons++)];
- gnttab_set_map_op(mop++, idx_to_kaddr(pending_idx),
+ gnttab_set_map_op(gop->map++, idx_to_kaddr(pending_idx),
GNTMAP_host_map | GNTMAP_readonly,
txp->gref, netif->domid);
@@ -1132,14 +1217,17 @@ static gnttab_map_grant_ref_t *netbk_get
frag_set_pending_idx(&frags[i], pending_idx);
}
- return mop;
+ if ((void *)gop->map > (void *)gop->copy && net_ratelimit())
+ netdev_warn(netif->dev, "Grant op overrun (%p > %p)\n",
+ gop->map, gop->copy);
}
-static int netbk_tx_check_mop(struct sk_buff *skb,
- gnttab_map_grant_ref_t **mopp)
+static int netbk_tx_check_gop(struct sk_buff *skb,
+ struct netbk_tx_gop *gop, bool hdr_copied)
{
- gnttab_map_grant_ref_t *mop = *mopp;
- u16 pending_idx = *(u16 *)skb->data;
+ gnttab_copy_t *cop = gop->copy;
+ gnttab_map_grant_ref_t *mop = gop->map;
+ u16 pending_idx = netbk_tx_cb(skb)->pending_idx[0];
netif_t *netif = pending_tx_info[pending_idx].netif;
netif_tx_request_t *txp;
struct skb_shared_info *shinfo = skb_shinfo(skb);
@@ -1147,8 +1235,18 @@ static int netbk_tx_check_mop(struct sk_
int i, err, start;
/* Check status of header. */
- err = mop->status;
- if (unlikely(err != GNTST_okay)) {
+ if (hdr_copied) {
+ err = (--cop)->status;
+ txp = &pending_tx_info[pending_idx].req;
+ if (txp->size > cop->len)
+ cmpxchg_local(&err, GNTST_okay, (--cop)->status);
+ make_tx_response(netif, txp,
+ err == GNTST_okay ? XEN_NETIF_RSP_OKAY
+ : XEN_NETIF_RSP_ERROR);
+ pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+ netif_put(netif);
+ } else if (unlikely((err = mop->status) != GNTST_okay)) {
+ ++mop;
txp = &pending_tx_info[pending_idx].req;
make_tx_response(netif, txp, XEN_NETIF_RSP_ERROR);
pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
@@ -1156,19 +1254,34 @@ static int netbk_tx_check_mop(struct sk_
} else {
set_phys_to_machine(idx_to_pfn(pending_idx),
FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
- grant_tx_handle[pending_idx] = mop->handle;
+ grant_tx_handle[pending_idx] = mop++->handle;
}
/* Skip first skb fragment if it is on same page as header fragment. */
start = (frag_get_pending_idx(shinfo->frags) == pending_idx);
- for (i = start; i < nr_frags; i++) {
+ for (i = 0; i < netbk_tx_cb(skb)->copy_slots; ++i) {
+ int newerr = (--cop)->status;
+
+ pending_idx = netbk_tx_cb(skb)->pending_idx[1 + i];
+ txp = &pending_tx_info[pending_idx].req;
+ if (txp->size > cop->len)
+ cmpxchg_local(&newerr, GNTST_okay, (--cop)->status);
+ make_tx_response(netif, txp,
+ newerr == GNTST_okay ? XEN_NETIF_RSP_OKAY
+ : XEN_NETIF_RSP_ERROR);
+ cmpxchg_local(&err, GNTST_okay, newerr);
+ pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+ netif_put(netif);
+ }
+
+ for (i = start; i < nr_frags; i++, mop++) {
int j, newerr;
pending_idx = frag_get_pending_idx(&shinfo->frags[i]);
/* Check error status: if okay then remember grant handle. */
- newerr = (++mop)->status;
+ newerr = mop->status;
if (likely(newerr == GNTST_okay)) {
set_phys_to_machine(idx_to_pfn(pending_idx),
FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
@@ -1190,8 +1303,10 @@ static int netbk_tx_check_mop(struct sk_
continue;
/* First error: invalidate header and preceding fragments. */
- pending_idx = *((u16 *)skb->data);
- netif_idx_release(pending_idx);
+ if (!hdr_copied) {
+ pending_idx = netbk_tx_cb(skb)->pending_idx[0];
+ netif_idx_release(pending_idx);
+ }
for (j = start; j < i; j++) {
pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
netif_idx_release(pending_idx);
@@ -1201,7 +1316,11 @@ static int netbk_tx_check_mop(struct sk_
err = newerr;
}
- *mopp = mop + 1;
+ gop->map = mop;
+ gop->copy = cop;
+ if ((void *)mop > (void *)cop && net_ratelimit())
+ netdev_warn(netif->dev, "Grant op check overrun (%p > %p)\n",
+ mop, cop);
return err;
}
@@ -1291,20 +1410,23 @@ static void net_tx_action(unsigned long
{
struct sk_buff *skb;
netif_t *netif;
- netif_tx_request_t txreq;
- netif_tx_request_t txfrags[MAX_SKB_FRAGS];
+ netif_tx_request_t txreq, *txslot;
struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
u16 pending_idx;
RING_IDX i;
- gnttab_map_grant_ref_t *mop;
+ struct netbk_tx_gop gop;
+ multicall_entry_t mcl[2];
unsigned int data_len;
int ret, work_to_do;
+ BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct netbk_tx_cb));
+
net_tx_action_dealloc();
- mop = tx_map_ops;
- while (((NR_PENDING_REQS + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
- !list_empty(&net_schedule_list)) {
+ gop.map = tx_map_ops;
+ gop.copy = tx_copy_ops + ARRAY_SIZEOF(tx_copy_ops);
+ while (NR_PENDING_REQS + XEN_NETIF_NR_SLOTS_MIN < MAX_PENDING_REQS
+ && !list_empty(&net_schedule_list)) {
/* Get a netif from the list with work to do. */
netif = poll_net_schedule_list();
/*
@@ -1386,7 +1508,8 @@ static void net_tx_action(unsigned long
continue;
}
- ret = netbk_count_requests(netif, &txreq, txfrags, work_to_do);
+ txslot = netbk->tx.slots;
+ ret = netbk_count_requests(netif, &txreq, txslot, work_to_do);
if (unlikely(ret < 0))
continue;
@@ -1414,6 +1537,12 @@ static void net_tx_action(unsigned long
data_len = (txreq.size > PKT_PROT_LEN &&
ret < MAX_SKB_FRAGS) ?
PKT_PROT_LEN : txreq.size;
+ while (ret > MAX_SKB_FRAGS ||
+ (ret && (data_len + txslot->size <= PKT_PROT_LEN ||
+ netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB))) {
+ data_len += txslot++->size;
+ --ret;
+ }
skb = alloc_skb(data_len + 16 + NET_IP_ALIGN,
GFP_ATOMIC | __GFP_NOWARN);
@@ -1439,39 +1568,38 @@ static void net_tx_action(unsigned long
}
}
- gnttab_set_map_op(mop, idx_to_kaddr(pending_idx),
- GNTMAP_host_map | GNTMAP_readonly,
- txreq.gref, netif->domid);
- mop++;
-
memcpy(&pending_tx_info[pending_idx].req,
&txreq, sizeof(txreq));
pending_tx_info[pending_idx].netif = netif;
- *((u16 *)skb->data) = pending_idx;
+ netbk_tx_cb(skb)->pending_idx[0] = pending_idx;
+ netbk_tx_cb(skb)->copy_slots = txslot - netbk->tx.slots;
__skb_put(skb, data_len);
+ gop.ptr = skb->data;
skb_shinfo(skb)->nr_frags = ret;
- if (data_len < txreq.size)
+ if (data_len < txreq.size) {
+ gnttab_set_map_op(gop.map++, idx_to_kaddr(pending_idx),
+ GNTMAP_host_map | GNTMAP_readonly,
+ txreq.gref, netif->domid);
skb_shinfo(skb)->nr_frags++;
- else
+ } else {
+ netbk_fill_tx_copy(&txreq, &gop, netif->domid);
pending_idx = INVALID_PENDING_IDX;
+ }
frag_set_pending_idx(skb_shinfo(skb)->frags, pending_idx);
__skb_queue_tail(&tx_queue, skb);
pending_cons++;
- mop = netbk_get_requests(netif, skb, txfrags, mop);
+ netbk_get_requests(netif, skb, netbk->tx.slots, &gop);
netif->tx.req_cons = i;
netif_schedule_work(netif);
-
- if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops))
- break;
}
- if (mop == tx_map_ops)
+ if (skb_queue_empty(&tx_queue))
goto out;
/* NOTE: some maps may fail with GNTST_eagain, which could be successfully
@@ -1479,22 +1607,28 @@ static void net_tx_action(unsigned long
* req and let the frontend resend the relevant packet again. This is fine
* because it is unlikely that a network buffer will be paged out or shared,
* and therefore it is unlikely to fail with GNTST_eagain. */
- ret = HYPERVISOR_grant_table_op(
- GNTTABOP_map_grant_ref, tx_map_ops, mop - tx_map_ops);
- BUG_ON(ret);
+ MULTI_grant_table_op(&mcl[0], GNTTABOP_copy, gop.copy,
+ tx_copy_ops + ARRAY_SIZE(tx_copy_ops) - gop.copy);
+ MULTI_grant_table_op(&mcl[1], GNTTABOP_map_grant_ref,
+ tx_map_ops, gop.map - tx_map_ops);
+ if (HYPERVISOR_multicall_check(mcl, 2, NULL))
+ BUG();
- mop = tx_map_ops;
+ gop.map = tx_map_ops;
+ gop.copy = tx_copy_ops + ARRAY_SIZE(tx_copy_ops);
while ((skb = __skb_dequeue(&tx_queue)) != NULL) {
struct net_device *dev;
netif_tx_request_t *txp;
- pending_idx = *((u16 *)skb->data);
+ pending_idx = netbk_tx_cb(skb)->pending_idx[0];
netif = pending_tx_info[pending_idx].netif;
dev = netif->dev;
txp = &pending_tx_info[pending_idx].req;
+ data_len = skb->len;
- /* Check the remap error code. */
- if (unlikely(netbk_tx_check_mop(skb, &mop))) {
+ /* Check the remap/copy error code. */
+ if (unlikely(netbk_tx_check_gop(skb, &gop,
+ data_len >= txp->size))) {
netdev_dbg(dev, "netback grant failed.\n");
skb_shinfo(skb)->nr_frags = 0;
kfree_skb(skb);
@@ -1502,17 +1636,13 @@ static void net_tx_action(unsigned long
continue;
}
- data_len = skb->len;
- memcpy(skb->data,
- (void *)(idx_to_kaddr(pending_idx)|txp->offset),
- data_len);
if (data_len < txp->size) {
+ memcpy(skb->data,
+ (void *)(idx_to_kaddr(pending_idx) + txp->offset),
+ data_len);
/* Append the packet payload as a fragment. */
txp->offset += data_len;
txp->size -= data_len;
- } else {
- /* Schedule a response immediately. */
- netif_idx_release(pending_idx);
}
if (txp->flags & XEN_NETTXF_csum_blank)
@@ -1544,15 +1674,6 @@ static void net_tx_action(unsigned long
continue;
}
- if (unlikely(netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB) &&
- unlikely(skb_linearize(skb))) {
- netdev_dbg(dev,
- "Can't linearize skb in net_tx_action.\n");
- kfree_skb(skb);
- dev->stats.rx_dropped++;
- continue;
- }
-
dev->stats.rx_bytes += skb->len;
dev->stats.rx_packets++;
@@ -1703,6 +1824,13 @@ static int __init netback_init(void)
if (!is_running_on_xen())
return -ENODEV;
+ BUILD_BUG_ON(XEN_NETIF_NR_SLOTS_MIN >= MAX_PENDING_REQS);
+ if (max_tx_slots < XEN_NETIF_NR_SLOTS_MIN) {
+ pr_info("netback: max_tx_slots too small (%u), using XEN_NETIF_NR_SLOTS_MIN (%d)\n",
+ max_tx_slots, XEN_NETIF_NR_SLOTS_MIN);
+ max_tx_slots = XEN_NETIF_NR_SLOTS_MIN;
+ }
+
/* We can increase reservation by this much in net_rx_action(). */
balloon_update_driver_allowance(NET_RX_RING_SIZE);
--- head.orig/drivers/xen/netfront/netfront.c 2014-01-07 16:36:20.000000000 +0100
+++ head/drivers/xen/netfront/netfront.c 2014-01-07 16:36:27.000000000 +0100
@@ -84,15 +84,15 @@ struct netfront_cb {
* For paravirtualised guests, flipping is the default.
*/
#ifdef CONFIG_XEN
-static int MODPARM_rx_copy = 0;
+static bool MODPARM_rx_copy;
module_param_named(rx_copy, MODPARM_rx_copy, bool, 0);
MODULE_PARM_DESC(rx_copy, "Copy packets from network card (rather than flip)");
-static int MODPARM_rx_flip = 0;
+static bool MODPARM_rx_flip;
module_param_named(rx_flip, MODPARM_rx_flip, bool, 0);
MODULE_PARM_DESC(rx_flip, "Flip packets from network card (rather than copy)");
#else
-static const int MODPARM_rx_copy = 1;
-static const int MODPARM_rx_flip = 0;
+# define MODPARM_rx_copy true
+# define MODPARM_rx_flip false
#endif
#define RX_COPY_THRESHOLD 256
@@ -230,7 +230,7 @@ static void xennet_sysfs_delif(struct ne
#define xennet_sysfs_delif(dev) do { } while(0)
#endif
-static inline int xennet_can_sg(struct net_device *dev)
+static inline bool xennet_can_sg(struct net_device *dev)
{
return dev->features & NETIF_F_SG;
}
@@ -2056,7 +2056,8 @@ static void network_set_multicast_list(s
{
}
-static u32 xennet_fix_features(struct net_device *dev, u32 features)
+static netdev_features_t xennet_fix_features(struct net_device *dev,
+ netdev_features_t features)
{
struct netfront_info *np = netdev_priv(dev);
int val;
@@ -2082,7 +2083,8 @@ static u32 xennet_fix_features(struct ne
return features;
}
-static int xennet_set_features(struct net_device *dev, u32 features)
+static int xennet_set_features(struct net_device *dev,
+ netdev_features_t features)
{
if (!(features & NETIF_F_SG) && dev->mtu > ETH_DATA_LEN) {
netdev_info(dev, "Reducing MTU because no SG offload");
@@ -2266,7 +2268,7 @@ static int __init netif_init(void)
}
if (!MODPARM_rx_flip && !MODPARM_rx_copy)
- MODPARM_rx_copy = 1; /* Default is to copy. */
+ MODPARM_rx_copy = true; /* Default is to copy. */
#endif
netif_init_accel();
--- head.orig/drivers/xen/pcifront/pci_op.c 2012-04-04 10:24:26.000000000 +0200
+++ head/drivers/xen/pcifront/pci_op.c 2013-01-25 15:10:03.000000000 +0100
@@ -12,7 +12,7 @@
#include <xen/evtchn.h>
#include "pcifront.h"
-static int verbose_request;
+static bool verbose_request;
module_param(verbose_request, bool, 0644);
static void pcifront_init_sd(struct pcifront_sd *sd,
--- head.orig/drivers/xen/pcifront/xenbus.c 2012-03-12 13:53:21.000000000 +0100
+++ head/drivers/xen/pcifront/xenbus.c 2012-03-12 13:55:45.000000000 +0100
@@ -371,7 +371,7 @@ static int pcifront_detach_devices(struc
pci_dev = pci_get_slot(pci_bus, PCI_DEVFN(slot, func));
if(!pci_dev) {
dev_dbg(&pdev->xdev->dev,
- "Cannot get PCI device %04x:%02x:%02x.%02x\n",
+ "Cannot get PCI device %04x:%02x:%02x.%u\n",
domain, bus, slot, func);
continue;
}
@@ -379,7 +379,7 @@ static int pcifront_detach_devices(struc
pci_dev_put(pci_dev);
dev_dbg(&pdev->xdev->dev,
- "PCI device %04x:%02x:%02x.%02x removed.\n",
+ "PCI device %04x:%02x:%02x.%u removed.\n",
domain, bus, slot, func);
}
--- head.orig/drivers/xen/scsiback/scsiback.c 2013-06-20 15:27:44.000000000 +0200
+++ head/drivers/xen/scsiback/scsiback.c 2013-01-25 15:10:30.000000000 +0100
@@ -56,7 +56,7 @@ static unsigned int vscsiif_reqs = 128;
module_param_named(reqs, vscsiif_reqs, uint, 0);
MODULE_PARM_DESC(reqs, "Number of scsiback requests to allocate");
-static int log_print_stat;
+static bool log_print_stat;
module_param(log_print_stat, bool, 0644);
#define SCSIBACK_INVALID_HANDLE (~0)
--- head.orig/drivers/xen/xen-pciback/pci_stub.c 2013-08-09 15:23:41.000000000 +0200
+++ head/drivers/xen/xen-pciback/pci_stub.c 2013-08-09 15:29:04.000000000 +0200
@@ -288,7 +288,9 @@ void pcistub_put_pci_dev(struct pci_dev
xen_pcibk_config_free_dyn_fields(found_psdev->dev);
xen_pcibk_config_reset_dev(found_psdev->dev);
+#ifndef CONFIG_XEN
xen_unregister_device_domain_owner(found_psdev->dev);
+#endif
spin_lock_irqsave(&found_psdev->lock, flags);
found_psdev->pdev = NULL;
--- head.orig/drivers/xen/xenbus/Makefile 2011-02-02 17:08:58.000000000 +0100
+++ head/drivers/xen/xenbus/Makefile 2012-02-09 13:58:18.000000000 +0100
@@ -1,12 +1,17 @@
obj-y += xenbus_client.o xenbus_comms.o xenbus_xs.o xenbus_probe.o
-obj-$(CONFIG_XEN_BACKEND) += xenbus_be.o
+backend-standalone-$(CONFIG_XEN) += xenbus_be.o
+obj-$(CONFIG_PARAVIRT_XEN) += xenbus_dev_frontend.o
xenbus_be-objs =
xenbus_be-objs += xenbus_backend_client.o
+xenbus_be-objs += xenbus_dev_backend.o
xenbus-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o
obj-y += $(xenbus-y) $(xenbus-m)
obj-$(CONFIG_XEN_XENBUS_DEV) += xenbus_dev.o
obj-$(CONFIG_PARAVIRT_XEN_BACKEND) += xenbus_probe_backend.o
+backend-standalone-$(CONFIG_PARAVIRT_XEN) += xenbus_dev_backend.o
obj-$(CONFIG_XEN_XENBUS_FRONTEND) += xenbus_probe_frontend.o
+
+obj-$(CONFIG_XEN_BACKEND) += $(backend-standalone-y)
--- head.orig/drivers/xen/xenbus/xenbus_client.c 2011-11-28 10:14:06.000000000 +0100
+++ head/drivers/xen/xenbus/xenbus_client.c 2012-02-17 09:16:09.000000000 +0100
@@ -36,18 +36,42 @@
#include <xen/gnttab.h>
#else
#include <linux/types.h>
+#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/export.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/page.h>
#include <xen/interface/xen.h>
#include <xen/interface/event_channel.h>
+#include <xen/balloon.h>
#include <xen/events.h>
#include <xen/grant_table.h>
#endif
#include <xen/xenbus.h>
+#include <xen/xen.h>
-#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#if defined(CONFIG_PARAVIRT_XEN)
+#include "xenbus_probe.h"
+
+struct xenbus_map_node {
+ struct list_head next;
+ union {
+ struct vm_struct *area; /* PV */
+ struct page *page; /* HVM */
+ };
+ grant_handle_t handle;
+};
+
+static DEFINE_SPINLOCK(xenbus_valloc_lock);
+static LIST_HEAD(xenbus_valloc_pages);
+
+struct xenbus_ring_ops {
+ int (*map)(struct xenbus_device *dev, grant_ref_t gnt, void **vaddr);
+ int (*unmap)(struct xenbus_device *dev, void *vaddr);
+};
+
+static const struct xenbus_ring_ops *ring_ops __read_mostly;
+#elif defined(HAVE_XEN_PLATFORM_COMPAT_H)
#include <xen/platform-compat.h>
#endif
@@ -418,19 +442,33 @@ EXPORT_SYMBOL_GPL(xenbus_free_evtchn);
*/
int xenbus_map_ring_valloc(struct xenbus_device *dev, grant_ref_t gnt_ref, void **vaddr)
{
+ return ring_ops->map(dev, gnt_ref, vaddr);
+}
+EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);
+
+static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev,
+ grant_ref_t gnt_ref, void **vaddr)
+{
struct gnttab_map_grant_ref op = {
.flags = GNTMAP_host_map | GNTMAP_contains_pte,
.ref = gnt_ref,
.dom = dev->otherend_id,
};
+ struct xenbus_map_node *node;
struct vm_struct *area;
pte_t *pte;
*vaddr = NULL;
+ node = kzalloc(sizeof(*node), GFP_KERNEL);
+ if (!node)
+ return -ENOMEM;
+
area = alloc_vm_area(PAGE_SIZE, &pte);
- if (!area)
+ if (!area) {
+ kfree(node);
return -ENOMEM;
+ }
op.host_addr = arbitrary_virt_to_machine(pte).maddr;
@@ -439,19 +477,59 @@ int xenbus_map_ring_valloc(struct xenbus
if (op.status != GNTST_okay) {
free_vm_area(area);
+ kfree(node);
xenbus_dev_fatal(dev, op.status,
"mapping in shared page %d from domain %d",
gnt_ref, dev->otherend_id);
return op.status;
}
- /* Stuff the handle in an unused field */
- area->phys_addr = (unsigned long)op.handle;
+ node->handle = op.handle;
+ node->area = area;
+
+ spin_lock(&xenbus_valloc_lock);
+ list_add(&node->next, &xenbus_valloc_pages);
+ spin_unlock(&xenbus_valloc_lock);
*vaddr = area->addr;
return 0;
}
-EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);
+
+static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev,
+ grant_ref_t gnt_ref, void **vaddr)
+{
+ struct xenbus_map_node *node;
+ int err;
+ void *addr;
+
+ *vaddr = NULL;
+
+ node = kzalloc(sizeof(*node), GFP_KERNEL);
+ if (!node)
+ return -ENOMEM;
+
+ err = alloc_xenballooned_pages(1, &node->page, false /* lowmem */);
+ if (err)
+ goto out_err;
+
+ addr = pfn_to_kaddr(page_to_pfn(node->page));
+
+ err = xenbus_map_ring(dev, gnt_ref, &node->handle, addr);
+ if (err)
+ goto out_err;
+
+ spin_lock(&xenbus_valloc_lock);
+ list_add(&node->next, &xenbus_valloc_pages);
+ spin_unlock(&xenbus_valloc_lock);
+
+ *vaddr = addr;
+ return 0;
+
+ out_err:
+ free_xenballooned_pages(1, &node->page);
+ kfree(node);
+ return err;
+}
/**
@@ -471,12 +549,10 @@ EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc
int xenbus_map_ring(struct xenbus_device *dev, grant_ref_t gnt_ref,
grant_handle_t *handle, void *vaddr)
{
- struct gnttab_map_grant_ref op = {
- .host_addr = (unsigned long)vaddr,
- .flags = GNTMAP_host_map,
- .ref = gnt_ref,
- .dom = dev->otherend_id,
- };
+ struct gnttab_map_grant_ref op;
+
+ gnttab_set_map_op(&op, (phys_addr_t)vaddr, GNTMAP_host_map, gnt_ref,
+ dev->otherend_id);
if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
BUG();
@@ -507,32 +583,36 @@ EXPORT_SYMBOL_GPL(xenbus_map_ring);
*/
int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr)
{
- struct vm_struct *area;
+ return ring_ops->unmap(dev, vaddr);
+}
+EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
+
+static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr)
+{
+ struct xenbus_map_node *node;
struct gnttab_unmap_grant_ref op = {
.host_addr = (unsigned long)vaddr,
};
unsigned int level;
- /* It'd be nice if linux/vmalloc.h provided a find_vm_area(void *addr)
- * method so that we don't have to muck with vmalloc internals here.
- * We could force the user to hang on to their struct vm_struct from
- * xenbus_map_ring_valloc, but these 6 lines considerably simplify
- * this API.
- */
- read_lock(&vmlist_lock);
- for (area = vmlist; area != NULL; area = area->next) {
- if (area->addr == vaddr)
- break;
+ spin_lock(&xenbus_valloc_lock);
+ list_for_each_entry(node, &xenbus_valloc_pages, next) {
+ if (node->area->addr == vaddr) {
+ list_del(&node->next);
+ goto found;
+ }
}
- read_unlock(&vmlist_lock);
+ node = NULL;
+ found:
+ spin_unlock(&xenbus_valloc_lock);
- if (!area) {
+ if (!node) {
xenbus_dev_error(dev, -ENOENT,
"can't find mapped virtual address %p", vaddr);
return GNTST_bad_virt_addr;
}
- op.handle = (grant_handle_t)area->phys_addr;
+ op.handle = node->handle;
op.host_addr = arbitrary_virt_to_machine(
lookup_address((unsigned long)vaddr, &level)).maddr;
@@ -540,16 +620,50 @@ int xenbus_unmap_ring_vfree(struct xenbu
BUG();
if (op.status == GNTST_okay)
- free_vm_area(area);
+ free_vm_area(node->area);
else
xenbus_dev_error(dev, op.status,
"unmapping page at handle %d error %d",
- (int16_t)area->phys_addr, op.status);
+ node->handle, op.status);
+ kfree(node);
return op.status;
}
-EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
+static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr)
+{
+ int rv;
+ struct xenbus_map_node *node;
+ void *addr;
+
+ spin_lock(&xenbus_valloc_lock);
+ list_for_each_entry(node, &xenbus_valloc_pages, next) {
+ addr = pfn_to_kaddr(page_to_pfn(node->page));
+ if (addr == vaddr) {
+ list_del(&node->next);
+ goto found;
+ }
+ }
+ node = NULL;
+ found:
+ spin_unlock(&xenbus_valloc_lock);
+
+ if (!node) {
+ xenbus_dev_error(dev, -ENOENT,
+ "can't find mapped virtual address %p", vaddr);
+ return GNTST_bad_virt_addr;
+ }
+
+ rv = xenbus_unmap_ring(dev, node->handle, addr);
+
+ if (!rv)
+ free_xenballooned_pages(1, &node->page);
+ else
+ WARN(1, "Leaking %p\n", vaddr);
+
+ kfree(node);
+ return rv;
+}
/**
* xenbus_unmap_ring
@@ -564,10 +678,9 @@ EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfre
int xenbus_unmap_ring(struct xenbus_device *dev,
grant_handle_t handle, void *vaddr)
{
- struct gnttab_unmap_grant_ref op = {
- .host_addr = (unsigned long)vaddr,
- .handle = handle,
- };
+ struct gnttab_unmap_grant_ref op;
+
+ gnttab_set_unmap_op(&op, (phys_addr_t)vaddr, GNTMAP_host_map, handle);
if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
BUG();
@@ -600,3 +713,23 @@ enum xenbus_state xenbus_read_driver_sta
return result;
}
EXPORT_SYMBOL_GPL(xenbus_read_driver_state);
+
+#if !defined(CONFIG_XEN) && !defined(MODULE)
+static const struct xenbus_ring_ops ring_ops_pv = {
+ .map = xenbus_map_ring_valloc_pv,
+ .unmap = xenbus_unmap_ring_vfree_pv,
+};
+
+static const struct xenbus_ring_ops ring_ops_hvm = {
+ .map = xenbus_map_ring_valloc_hvm,
+ .unmap = xenbus_unmap_ring_vfree_hvm,
+};
+
+void __init xenbus_ring_ops_init(void)
+{
+ if (xen_pv_domain())
+ ring_ops = &ring_ops_pv;
+ else
+ ring_ops = &ring_ops_hvm;
+}
+#endif
--- head.orig/drivers/xen/xenbus/xenbus_comms.h 2011-04-11 13:43:15.000000000 +0200
+++ head/drivers/xen/xenbus/xenbus_comms.h 2012-02-09 12:32:50.000000000 +0100
@@ -31,6 +31,8 @@
#ifndef _XENBUS_COMMS_H
#define _XENBUS_COMMS_H
+#include <linux/fs.h>
+
int xs_init(void);
int xb_init_comms(void);
@@ -43,6 +45,8 @@ int xs_input_avail(void);
extern struct xenstore_domain_interface *xen_store_interface;
extern int xen_store_evtchn;
+extern const struct file_operations xen_xenbus_fops;
+
/* For xenbus internal use. */
enum {
XENBUS_XSD_UNCOMMITTED = 0,
--- head.orig/drivers/xen/xenbus/xenbus_dev_backend.c 2014-05-06 08:31:14.000000000 +0200
+++ head/drivers/xen/xenbus/xenbus_dev_backend.c 2013-08-09 15:29:19.000000000 +0200
@@ -9,7 +9,9 @@
#include <linux/capability.h>
#include <xen/xen.h>
+#ifdef CONFIG_PARAVIRT_XEN
#include <xen/page.h>
+#endif
#include <xen/xenbus.h>
#include <xen/xenbus_dev.h>
#include <xen/grant_table.h>
@@ -101,7 +103,7 @@ static int xenbus_backend_mmap(struct fi
return -EINVAL;
if (remap_pfn_range(vma, vma->vm_start,
- virt_to_pfn(xen_store_interface),
+ PFN_DOWN(__pa(xen_store_interface)),
size, vma->vm_page_prot))
return -EAGAIN;
--- head.orig/drivers/xen/xenbus/xenbus_probe.c 2012-03-12 13:54:30.000000000 +0100
+++ head/drivers/xen/xenbus/xenbus_probe.c 2012-03-12 13:55:51.000000000 +0100
@@ -84,10 +84,10 @@
#endif
int xen_store_evtchn;
-PARAVIRT_EXPORT_SYMBOL(xen_store_evtchn);
+EXPORT_SYMBOL_GPL(xen_store_evtchn);
struct xenstore_domain_interface *xen_store_interface;
-PARAVIRT_EXPORT_SYMBOL(xen_store_interface);
+EXPORT_SYMBOL_GPL(xen_store_interface);
static unsigned long xen_store_mfn;
@@ -1330,6 +1330,8 @@ xenbus_init(void)
xenbus_dev_init();
#else /* !defined(CONFIG_XEN) && !defined(MODULE) */
+ xenbus_ring_ops_init();
+
if (xen_hvm_domain()) {
uint64_t v = 0;
err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v);
--- head.orig/drivers/xen/xenbus/xenbus_probe.h 2014-05-06 08:32:24.000000000 +0200
+++ head/drivers/xen/xenbus/xenbus_probe.h 2012-06-08 11:23:49.000000000 +0200
@@ -104,4 +104,6 @@ extern void xenbus_otherend_changed(stru
extern int xenbus_read_otherend_details(struct xenbus_device *xendev,
char *id_node, char *path_node);
+void xenbus_ring_ops_init(void);
+
#endif
--- head.orig/include/linux/pci_ids.h 2014-05-06 08:31:14.000000000 +0200
+++ head/include/linux/pci_ids.h 2012-06-21 08:31:17.000000000 +0200
@@ -75,6 +75,7 @@
#define PCI_CLASS_SYSTEM_RTC 0x0803
#define PCI_CLASS_SYSTEM_PCI_HOTPLUG 0x0804
#define PCI_CLASS_SYSTEM_SDHCI 0x0805
+#define PCI_CLASS_SYSTEM_IOMMU 0x0806
#define PCI_CLASS_SYSTEM_OTHER 0x0880
#define PCI_BASE_CLASS_INPUT 0x09
--- head.orig/include/xen/balloon.h 2012-02-03 13:44:44.000000000 +0100
+++ head/include/xen/balloon.h 2012-02-09 12:32:50.000000000 +0100
@@ -88,11 +88,11 @@ void free_xenballooned_pages(int nr_page
#endif /* CONFIG_PARAVIRT_XEN */
-struct sys_device;
+struct device;
#ifdef CONFIG_XEN_SELFBALLOONING
-extern int register_xen_selfballooning(struct sys_device *sysdev);
+extern int register_xen_selfballooning(struct device *dev);
#else
-static inline int register_xen_selfballooning(struct sys_device *sysdev)
+static inline int register_xen_selfballooning(struct device *dev)
{
return -ENOSYS;
}
--- head.orig/include/xen/blkif.h 2014-02-06 14:38:26.000000000 +0100
+++ head/include/xen/blkif.h 2014-02-06 14:39:10.000000000 +0100
@@ -53,7 +53,7 @@ struct blkif_x86_32_request {
};
struct blkif_x86_32_discard {
uint8_t operation; /* BLKIF_OP_DISCARD */
- uint8_t reserved; /* */
+ uint8_t flag; /* BLKIF_DISCARD_* */
blkif_vdev_t handle; /* same as for read/write requests */
uint64_t id; /* private guest value, echoed in resp */
blkif_sector_t sector_number;/* start sector idx on disk */
@@ -81,7 +81,7 @@ struct blkif_x86_64_request {
};
struct blkif_x86_64_discard {
uint8_t operation; /* BLKIF_OP_DISCARD */
- uint8_t reserved; /* */
+ uint8_t flag; /* BLKIF_DISCARD_* */
blkif_vdev_t handle; /* sane as for read/write requests */
uint64_t __attribute__((__aligned__(8))) id;
blkif_sector_t sector_number;/* start sector idx on disk */
--- head.orig/include/xen/evtchn.h 2012-10-23 15:45:43.000000000 +0200
+++ head/include/xen/evtchn.h 2012-10-23 15:56:48.000000000 +0200
@@ -56,6 +56,7 @@ struct irq_cfg {
};
};
struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node);
+static inline int evtchn_make_refcounted(unsigned int evtchn) { return 0; }
#endif
/*
--- head.orig/include/xen/interface/grant_table.h 2013-06-20 15:22:34.000000000 +0200
+++ head/include/xen/interface/grant_table.h 2013-06-20 15:28:42.000000000 +0200
@@ -124,7 +124,9 @@ typedef uint32_t grant_ref_t;
* Version 1 of the grant table entry structure is maintained purely
* for backwards compatibility. New guests should use version 2.
*/
-#if __XEN_INTERFACE_VERSION__ < 0x0003020a
+#if defined(CONFIG_PARAVIRT_XEN)
+#define grant_entry grant_entry_v1
+#elif __XEN_INTERFACE_VERSION__ < 0x0003020a
#define grant_entry_v1 grant_entry
#define grant_entry_v1_t grant_entry_t
#endif
@@ -212,7 +214,7 @@ typedef struct grant_entry_v1 grant_entr
* The interface by which domains use grant references does not depend
* on the grant table version in use by the other domain.
*/
-#if __XEN_INTERFACE_VERSION__ >= 0x0003020a
+#if defined(CONFIG_PARAVIRT_XEN) || __XEN_INTERFACE_VERSION__ >= 0x0003020a
/*
* Version 1 and version 2 grant entries share a common prefix. The
* fields of the prefix are documented as part of struct
@@ -305,7 +307,7 @@ typedef uint16_t grant_status_t;
#define GNTTABOP_copy 5
#define GNTTABOP_query_size 6
#define GNTTABOP_unmap_and_replace 7
-#if __XEN_INTERFACE_VERSION__ >= 0x0003020a
+#if defined(CONFIG_PARAVIRT_XEN) || __XEN_INTERFACE_VERSION__ >= 0x0003020a
#define GNTTABOP_set_version 8
#define GNTTABOP_get_status_frames 9
#define GNTTABOP_get_version 10
@@ -513,10 +515,11 @@ struct gnttab_unmap_and_replace {
/* OUT parameters. */
int16_t status; /* => enum grant_status */
};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_unmap_and_replace);
typedef struct gnttab_unmap_and_replace gnttab_unmap_and_replace_t;
DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_and_replace_t);
-#if __XEN_INTERFACE_VERSION__ >= 0x0003020a
+#if defined(CONFIG_PARAVIRT_XEN) || __XEN_INTERFACE_VERSION__ >= 0x0003020a
/*
* GNTTABOP_set_version: Request a particular version of the grant
* table shared table structure. This operation can only be performed
@@ -528,6 +531,7 @@ struct gnttab_set_version {
/* IN/OUT parameters */
uint32_t version;
};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_set_version);
typedef struct gnttab_set_version gnttab_set_version_t;
DEFINE_XEN_GUEST_HANDLE(gnttab_set_version_t);
@@ -552,6 +556,7 @@ struct gnttab_get_status_frames {
int16_t status; /* => enum grant_status */
XEN_GUEST_HANDLE(uint64_t) frame_list;
};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_get_status_frames);
typedef struct gnttab_get_status_frames gnttab_get_status_frames_t;
DEFINE_XEN_GUEST_HANDLE(gnttab_get_status_frames_t);
@@ -566,6 +571,7 @@ struct gnttab_get_version {
/* OUT parameters */
uint32_t version;
};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_get_version);
typedef struct gnttab_get_version gnttab_get_version_t;
DEFINE_XEN_GUEST_HANDLE(gnttab_get_version_t);
--- head.orig/include/xen/interface/io/blkif.h 2014-04-01 11:38:40.000000000 +0200
+++ head/include/xen/interface/io/blkif.h 2014-01-30 10:22:48.000000000 +0100
@@ -532,25 +532,40 @@ struct blkif_request_segment {
*/
struct blkif_request {
uint8_t operation; /* BLKIF_OP_??? */
+#if !defined(CONFIG_PARAVIRT_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
uint8_t nr_segments; /* number of segments */
blkif_vdev_t handle; /* only for read/write requests */
uint64_t id; /* private guest value, echoed in resp */
-#if !defined(CONFIG_PARAVIRT_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+};
#else
union {
- struct blkif_request_rw {
+ struct __attribute__((__packed__)) blkif_request_rw {
+ uint8_t nr_segments; /* number of segments */
+ blkif_vdev_t handle; /* only for read/write requests */
+#ifdef CONFIG_X86_64
+ uint32_t _pad1; /* offsetof(blkif_request,u.rw.id) == 8 */
+#endif
+ uint64_t id; /* private guest value, echoed in resp */
blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
} rw;
- struct blkif_request_discard {
+ struct __attribute__((__packed__)) blkif_request_discard {
+ uint8_t flag; /* BLKIF_DISCARD_SECURE or zero. */
+#define BLKIF_DISCARD_SECURE (1<<0) /* ignored if discard-secure=0 */
+ blkif_vdev_t _pad1; /* only for read/write requests */
+#ifdef CONFIG_X86_64
+ uint32_t _pad2; /* offsetof(blkif_req..,u.discard.id)==8*/
+#endif
+ uint64_t id; /* private guest value, echoed in resp */
blkif_sector_t sector_number;
- uint64_t nr_sectors;
+ uint64_t nr_sectors;
+ uint8_t _pad3;
} discard;
} u;
+} __attribute__((__packed__));
#endif
-};
typedef struct blkif_request blkif_request_t;
#if !defined(CONFIG_PARAVIRT_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
--- head.orig/lib/swiotlb-xen.c 2011-07-01 15:19:35.000000000 +0200
+++ head/lib/swiotlb-xen.c 2012-02-09 12:32:50.000000000 +0100
@@ -114,11 +114,11 @@ setup_io_tlb_npages(char *str)
__setup("swiotlb=", setup_io_tlb_npages);
/* make io_tlb_overflow tunable too? */
-unsigned long swioltb_nr_tbl(void)
+unsigned long swiotlb_nr_tbl(void)
{
return io_tlb_nslabs;
}
-
+EXPORT_SYMBOL_GPL(swiotlb_nr_tbl);
/* Note that this doesn't work with highmem page */
static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
volatile void *address)