From: jbeulich@novell.com
Subject: replace Xen's custom time handling with an implementation based on the GENERIC_CLOCKEVENTS infrastructure
Patch-mainline: n/a

Once validated, this could be merged into the 2.6.?? patch.
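
For reference, the core of the conversion is the standard clockevents
pattern: each CPU registers a oneshot-only clock_event_device whose
set_next_event callback programs a Xen single-shot timer, and the
per-CPU VIRQ_TIMER handler simply invokes the device's event_handler so
that the generic tick/timekeeping code performs the jiffies, process
accounting and NO_HZ work that the old timer_interrupt() did by hand.
A minimal sketch of that pattern follows; it is illustrative only (the
demo_* names are not part of the patch) and assumes the xen_local_clock()
helper declared in the <xen/clock.h> header introduced below.

/* Illustrative sketch only -- mirrors the shape of drivers/xen/core/clockevents.c below. */
#include <linux/clockchips.h>
#include <linux/percpu.h>
#include <linux/smp.h>
#include <asm/hypervisor.h>
#include <xen/clock.h>
#include <xen/interface/vcpu.h>

static void demo_set_mode(enum clock_event_mode mode,
			  struct clock_event_device *evt)
{
	/* Oneshot only; cancel any pending timeout when shut down. */
	if (mode == CLOCK_EVT_MODE_SHUTDOWN || mode == CLOCK_EVT_MODE_UNUSED)
		HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer,
				   smp_processor_id(), NULL);
}

static int demo_set_next_event(unsigned long delta,
			       struct clock_event_device *evt)
{
	struct vcpu_set_singleshot_timer single = {
		.timeout_abs_ns = xen_local_clock() + delta,
		.flags          = VCPU_SSHOTTMR_future,
	};
	int rc = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer,
				    smp_processor_id(), &single);

	/* -ETIME tells the clockevents core the deadline already passed. */
	return rc == -ETIME ? -ETIME : 0;
}

static DEFINE_PER_CPU(struct clock_event_device, demo_clock_event) = {
	.name           = "xen-demo",
	.features       = CLOCK_EVT_FEAT_ONESHOT,
	.rating         = 500,
	.mult           = 1,		/* deltas are passed in ns */
	.shift          = 0,
	.min_delta_ns   = 100000,	/* Xen may fire this much early */
	.max_delta_ns   = 0xffffffff,
	.set_mode       = demo_set_mode,
	.set_next_event = demo_set_next_event,
	.irq            = -1,
};

/* Run on each CPU once its VIRQ_TIMER interrupt has been bound. */
static void __cpuinit demo_register_clockevent(void)
{
	struct clock_event_device *evt = &__get_cpu_var(demo_clock_event);

	evt->cpumask = cpumask_of(smp_processor_id());
	clockevents_register_device(evt);
}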

--- head.orig/arch/x86/Kconfig	2012-05-08 10:54:33.000000000 +0200
+++ head/arch/x86/Kconfig	2012-02-10 11:43:34.000000000 +0100
@@ -105,7 +105,6 @@ config CLOCKSOURCE_WATCHDOG
 
 config GENERIC_CLOCKEVENTS
 	def_bool y
-	depends on !XEN
 
 config ARCH_CLOCKSOURCE_DATA
 	def_bool y
--- head.orig/arch/x86/include/mach-xen/asm/hypervisor.h	2012-05-11 16:45:57.000000000 +0200
+++ head/arch/x86/include/mach-xen/asm/hypervisor.h	2012-05-11 16:22:53.000000000 +0200
@@ -71,7 +71,6 @@ extern start_info_t *xen_start_info;
 #define init_hypervisor_platform() init_hypervisor(&boot_cpu_data)
 
 DECLARE_PER_CPU(struct vcpu_runstate_info, runstate);
-struct vcpu_runstate_info *setup_runstate_area(unsigned int cpu);
 #define vcpu_running(cpu) (per_cpu(runstate.state, cpu) == RUNSTATE_running)
 
 /* arch/xen/kernel/evtchn.c */
--- head.orig/arch/x86/include/mach-xen/asm/irqflags.h	2011-09-08 16:54:08.000000000 +0200
+++ head/arch/x86/include/mach-xen/asm/irqflags.h	2012-05-24 09:13:47.000000000 +0200
@@ -5,6 +5,7 @@
 
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
+#include <xen/interface/vcpu.h>
 /*
  * The use of 'barrier' in the following reflects their use as local-lock
  * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
@@ -33,10 +34,6 @@
 
 #define xen_irq_enable() xen_restore_fl(0)
 
-void xen_safe_halt(void);
-
-void xen_halt(void);
-
 #define arch_local_save_flags() xen_save_fl()
 
 #define arch_local_irq_restore(flags) xen_restore_fl(flags)
@@ -49,19 +46,16 @@ void xen_halt(void);
  * Used in the idle loop; sti takes one instruction cycle
  * to complete:
  */
-static inline void arch_safe_halt(void)
-{
-	xen_safe_halt();
-}
+#define arch_safe_halt HYPERVISOR_block
 
 /*
  * Used when interrupts are already enabled or to
  * shutdown the processor:
  */
-static inline void halt(void)
-{
-	xen_halt();
-}
+#define halt() VOID(irqs_disabled()					\
+		    ? HYPERVISOR_vcpu_op(VCPUOP_down,			\
+					 smp_processor_id(), NULL)	\
+		    : 0)
 
 /*
  * For spinlocks, etc:
--- head.orig/arch/x86/kernel/time-xen.c	2012-04-11 17:01:05.000000000 +0200
+++ head/arch/x86/kernel/time-xen.c	2012-04-11 17:02:27.000000000 +0200
@@ -20,15 +20,12 @@
 #include <linux/cpufreq.h>
 #include <linux/clocksource.h>
 
-extern seqlock_t xtime_lock;
-extern void do_timer(unsigned long ticks);
-
 #include <asm/vsyscall.h>
 #include <asm/delay.h>
 #include <asm/time.h>
 #include <asm/timer.h>
 
-#include <xen/evtchn.h>
+#include <xen/clock.h>
 #include <xen/sysctl.h>
 #include <xen/interface/vcpu.h>
 
@@ -54,13 +51,7 @@ static DEFINE_PER_CPU(struct shadow_time
 static struct timespec shadow_tv;
 static u32 shadow_tv_version;
 
-/* Keep track of last time we did processing/updating of jiffies and xtime. */
-static u64 processed_system_time;   /* System time (ns) at last processing. */
-static DEFINE_PER_CPU(u64, processed_system_time);
-
-/* How much CPU time was spent blocked and how much was 'stolen'? */
-static DEFINE_PER_CPU(u64, processed_stolen_time);
-static DEFINE_PER_CPU(u64, processed_blocked_time);
+static u64 jiffies_bias, system_time_bias;
 
 /* Current runstate of each CPU (updated automatically by the hypervisor). */
 DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
@@ -68,16 +59,6 @@ DEFINE_PER_CPU(struct vcpu_runstate_info
 /* Must be signed, as it's compared with s64 quantities which can be -ve. */
 #define NS_PER_TICK (1000000000LL/HZ)
 
-static struct vcpu_set_periodic_timer xen_set_periodic_tick = {
-	.period_ns = NS_PER_TICK
-};
-
-/*
- * GCC 4.3 can turn loops over an induction variable into division. We do
- * not support arbitrary 64-bit division, and so must break the induction.
- */
-#define clobber_induction_variable(v) asm ( "" : "+r" (v) )
-
 /* Does this guest OS track Xen time, or set its wall clock independently? */
 static int independent_wallclock = 0;
 static int __init __independent_wallclock(char *str)
@@ -151,6 +132,11 @@ static u64 get_nsec_offset(struct shadow
 	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
 }
 
+static inline u64 processed_system_time(u64 jiffies_64)
+{
+	return (jiffies_64 - jiffies_bias) * NS_PER_TICK + system_time_bias;
+}
+
 static void update_wallclock(bool local)
 {
 	static DEFINE_MUTEX(uwc_mutex);
@@ -167,7 +153,7 @@ static void update_wallclock(bool local)
 	} while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));
 
 	if (local) {
-		u64 tmp = processed_system_time;
+		u64 tmp = processed_system_time(get_jiffies_64());
 		long nsec = do_div(tmp, NSEC_PER_SEC);
 		struct timespec tv;
 
@@ -185,6 +171,14 @@ static void _update_wallclock(struct wor
 }
 static DECLARE_WORK(update_wallclock_work, _update_wallclock);
 
+void xen_check_wallclock_update(void)
+{
+	if (shadow_tv_version != HYPERVISOR_shared_info->wc_version
+	    && !is_initial_xendomain() && !independent_wallclock
+	    && keventd_up())
+		schedule_work(&update_wallclock_work);
+}
+
 /*
  * Reads a consistent set of time-base values from Xen, into a shadow data
  * area.
@@ -286,7 +280,7 @@ static void sync_xen_wallclock(unsigned 
 	op.cmd = XENPF_settime;
 	op.u.settime.secs        = now.tv_sec;
 	op.u.settime.nsecs       = now.tv_nsec;
-	op.u.settime.system_time = processed_system_time;
+	op.u.settime.system_time = processed_system_time(get_jiffies_64());
 	WARN_ON(HYPERVISOR_platform_op(&op));
 
 	update_wallclock(false);
@@ -296,7 +290,7 @@ static void sync_xen_wallclock(unsigned 
 }
 #endif /* CONFIG_XEN_PRIVILEGED_GUEST */
 
-static unsigned long long local_clock(void)
+unsigned long long xen_local_clock(void)
 {
 	unsigned int cpu = get_cpu();
 	struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
@@ -331,7 +325,7 @@ unsigned long xen_read_wallclock(void)
 		rmb();
 	} while ((s->wc_version & 1) | (version ^ s->wc_version));
 
-	delta = local_clock() + (u64)sec * NSEC_PER_SEC + nsec;
+	delta = xen_local_clock() + (u64)sec * NSEC_PER_SEC + nsec;
 	do_div(delta, NSEC_PER_SEC);
 
 	return delta;
@@ -388,7 +382,7 @@ unsigned long long sched_clock(void)
 	 */
 	preempt_disable();
 
-	now = local_clock();
+	now = xen_local_clock();
 
 	get_runstate_snapshot(&runstate);
 
@@ -431,136 +425,6 @@ unsigned long profile_pc(struct pt_regs 
 }
 EXPORT_SYMBOL(profile_pc);
 
-/*
- * Default timer interrupt handler
- */
-static irqreturn_t timer_interrupt(int irq, void *dev_id)
-{
-	s64 delta, delta_cpu, stolen, blocked;
-	unsigned int i, cpu = smp_processor_id();
-	struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
-	struct vcpu_runstate_info runstate;
-
-	/*
-	 * Here we are in the timer irq handler. We just have irqs locally
-	 * disabled but we don't know if the timer_bh is running on the other
-	 * CPU. We need to avoid to SMP race with it. NOTE: we don' t need
-	 * the irq version of write_lock because as just said we have irq
-	 * locally disabled. -arca
-	 */
-	write_seqlock(&xtime_lock);
-
-	do {
-		get_time_values_from_xen(cpu);
-
-		/* Obtain a consistent snapshot of elapsed wallclock cycles. */
-		delta = delta_cpu =
-			shadow->system_timestamp + get_nsec_offset(shadow);
-		delta     -= processed_system_time;
-		delta_cpu -= per_cpu(processed_system_time, cpu);
-
-		get_runstate_snapshot(&runstate);
-	} while (!time_values_up_to_date(cpu));
-
-	if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
-	     unlikely(delta_cpu < -(s64)permitted_clock_jitter))
-	    && printk_ratelimit()) {
-		printk("Timer ISR/%u: Time went backwards: "
-		       "delta=%lld delta_cpu=%lld shadow=%lld "
-		       "off=%lld processed=%lld cpu_processed=%lld\n",
-		       cpu, delta, delta_cpu, shadow->system_timestamp,
-		       (s64)get_nsec_offset(shadow),
-		       processed_system_time,
-		       per_cpu(processed_system_time, cpu));
-		for (i = 0; i < num_online_cpus(); i++)
-			printk(" %d: %lld\n", i,
-			       per_cpu(processed_system_time, i));
-	}
-
-	/* System-wide jiffy work. */
-	if (delta >= NS_PER_TICK) {
-		do_div(delta, NS_PER_TICK);
-		processed_system_time += delta * NS_PER_TICK;
-		while (delta > HZ) {
-			clobber_induction_variable(delta);
-			do_timer(HZ);
-			delta -= HZ;
-		}
-		do_timer(delta);
-	}
-
-	write_sequnlock(&xtime_lock);
-
-	if (shadow_tv_version != HYPERVISOR_shared_info->wc_version
-	    && !is_initial_xendomain() && !independent_wallclock
-	    && keventd_up())
-		schedule_work(&update_wallclock_work);
-
-	/*
-	 * Account stolen ticks.
-	 * ensures that the ticks are accounted as stolen.
-	 */
-	stolen = runstate.time[RUNSTATE_runnable]
-		 + runstate.time[RUNSTATE_offline]
-		 - per_cpu(processed_stolen_time, cpu);
-	if ((stolen > 0) && (delta_cpu > 0)) {
-		delta_cpu -= stolen;
-		if (unlikely(delta_cpu < 0))
-			stolen += delta_cpu; /* clamp local-time progress */
-		do_div(stolen, NS_PER_TICK);
-		per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
-		per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
-		account_steal_ticks(stolen);
-	}
-
-	/*
-	 * Account blocked ticks.
-	 * ensures that the ticks are accounted as idle/wait.
-	 */
-	blocked = runstate.time[RUNSTATE_blocked]
-		  - per_cpu(processed_blocked_time, cpu);
-	if ((blocked > 0) && (delta_cpu > 0)) {
-		delta_cpu -= blocked;
-		if (unlikely(delta_cpu < 0))
-			blocked += delta_cpu; /* clamp local-time progress */
-		do_div(blocked, NS_PER_TICK);
-		per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
-		per_cpu(processed_system_time, cpu)  += blocked * NS_PER_TICK;
-		account_idle_ticks(blocked);
-	}
-
-	/* Account user/system ticks. */
-	if (delta_cpu > 0) {
-		cputime_t ct;
-
-		do_div(delta_cpu, NS_PER_TICK);
-		per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
-		ct = jiffies_to_cputime(delta_cpu);
-		if (user_mode_vm(get_irq_regs()))
-			account_user_time(current, ct, cputime_to_scaled(ct));
-		else if (current != idle_task(cpu)
-			 || irq_count() != HARDIRQ_OFFSET)
-			account_system_time(current, HARDIRQ_OFFSET,
-					    ct, cputime_to_scaled(ct));
-		else
-			account_idle_ticks(delta_cpu);
-	}
-
-	/* Offlined for more than a few seconds? Avoid lockup warnings. */
-	if (stolen > 5*HZ)
-		touch_softlockup_watchdog();
-
-	/* Local timer processing (see update_process_times()). */
-	run_local_timers();
-	rcu_check_callbacks(cpu, user_mode_vm(get_irq_regs()));
-	printk_tick();
-	scheduler_tick();
-	run_posix_cpu_timers(current);
-	profile_tick(CPU_PROFILING);
-
-	return IRQ_HANDLED;
-}
-
 void mark_tsc_unstable(char *reason)
 {
 #ifndef CONFIG_XEN /* XXX Should tell the hypervisor about this fact. */
@@ -569,24 +433,13 @@ void mark_tsc_unstable(char *reason)
 }
 EXPORT_SYMBOL_GPL(mark_tsc_unstable);
 
-static void init_missing_ticks_accounting(unsigned int cpu)
-{
-	struct vcpu_runstate_info *runstate = setup_runstate_area(cpu);
-
-	per_cpu(processed_blocked_time, cpu) =
-		runstate->time[RUNSTATE_blocked];
-	per_cpu(processed_stolen_time, cpu) =
-		runstate->time[RUNSTATE_runnable] +
-		runstate->time[RUNSTATE_offline];
-}
-
 static cycle_t cs_last;
 
 static cycle_t xen_clocksource_read(struct clocksource *cs)
 {
 #ifdef CONFIG_SMP
 	cycle_t last = get_64bit(&cs_last);
-	cycle_t ret = local_clock();
+	cycle_t ret = xen_local_clock();
 
 	if (unlikely((s64)(ret - last) < 0)) {
 		if (last - ret > permitted_clock_jitter
@@ -612,7 +465,7 @@ static cycle_t xen_clocksource_read(stru
 		last = cur;
 	}
 #else
-	return local_clock();
+	return xen_local_clock();
 #endif
 }
 
@@ -623,26 +476,13 @@ static void xen_clocksource_resume(struc
 
 	init_cpu_khz();
 
-	for_each_online_cpu(cpu) {
-		switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu,
-					   &xen_set_periodic_tick)) {
-		case 0:
-#if CONFIG_XEN_COMPAT <= 0x030004
-		case -ENOSYS:
-#endif
-			break;
-		default:
-			BUG();
-		}
+	for_each_online_cpu(cpu)
 		get_time_values_from_xen(cpu);
-		per_cpu(processed_system_time, cpu) =
-			per_cpu(shadow_time, 0).system_timestamp;
-		init_missing_ticks_accounting(cpu);
-	}
 
-	processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
+	jiffies_bias = get_jiffies_64();
+	system_time_bias = per_cpu(shadow_time, 0).system_timestamp;
 
-	cs_last = local_clock();
+	cs_last = xen_local_clock();
 }
 
 static struct clocksource clocksource_xen = {
@@ -656,7 +496,7 @@ static struct clocksource clocksource_xe
 	.resume			= xen_clocksource_resume,
 };
 
-struct vcpu_runstate_info *setup_runstate_area(unsigned int cpu)
+void setup_runstate_area(unsigned int cpu)
 {
 	struct vcpu_register_runstate_memory_area area;
 	struct vcpu_runstate_info *rs = &per_cpu(runstate, cpu);
@@ -669,28 +509,12 @@ struct vcpu_runstate_info *setup_runstat
 		memset(rs, 0, sizeof(*rs));
 		WARN_ON(rc != -ENOSYS);
 	}
-
-	return rs;
-}
-
-/* Dynamically-mapped IRQ. */
-static int __read_mostly timer_irq = -1;
-static struct irqaction timer_action = {
-	.handler = timer_interrupt,
-	.flags   = IRQF_DISABLED|IRQF_TIMER,
-	.name    = "timer"
-};
-
-static void setup_cpu0_timer_irq(void)
-{
-	timer_irq = bind_virq_to_irqaction(VIRQ_TIMER, 0, &timer_action);
-	BUG_ON(timer_irq < 0);
 }
 
 static void __init _late_time_init(void)
 {
 	update_wallclock(false);
-	setup_cpu0_timer_irq();
+	xen_clockevents_init();
 }
 
 void __init time_init(void)
@@ -699,22 +523,11 @@ void __init time_init(void)
 	printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
 	       cpu_khz / 1000, cpu_khz % 1000);
 
-	switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, 0,
-				   &xen_set_periodic_tick)) {
-	case 0:
-#if CONFIG_XEN_COMPAT <= 0x030004
-	case -ENOSYS:
-#endif
-		break;
-	default:
-		BUG();
-	}
-
+	setup_runstate_area(0);
 	get_time_values_from_xen(0);
 
-	processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
-	per_cpu(processed_system_time, 0) = processed_system_time;
-	init_missing_ticks_accounting(0);
+	jiffies_bias     = jiffies_64;
+	system_time_bias = per_cpu(shadow_time, 0).system_timestamp;
 
 	clocksource_register_hz(&clocksource_xen, NSEC_PER_SEC);
 
@@ -730,146 +543,23 @@ void __init time_init(void)
 /* Convert jiffies to system time. */
 u64 jiffies_to_st(unsigned long j)
 {
-	unsigned long seq;
-	long delta;
-	u64 st;
+	u64 j64 = get_jiffies_64();
+	long delta = j - (unsigned long)j64;
 
-	do {
-		seq = read_seqbegin(&xtime_lock);
-		delta = j - jiffies;
-		if (delta < 1) {
-			/* Triggers in some wrap-around cases, but that's okay:
-			 * we just end up with a shorter timeout. */
-			st = processed_system_time + NS_PER_TICK;
-		} else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
-			/* Very long timeout means there is no pending timer.
-			 * We indicate this to Xen by passing zero timeout. */
-			st = 0;
-		} else {
-			st = processed_system_time + delta * (u64)NS_PER_TICK;
-		}
-	} while (read_seqretry(&xtime_lock, seq));
+	if (delta < 1)
+		/* Triggers in some wrap-around cases, but that's okay:
+		 * we just end up with a shorter timeout. */
+		return processed_system_time(j64) + NS_PER_TICK;
+
+	if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0)
+		/* Very long timeout means there is no pending timer.
+		 * We indicate this to Xen by passing zero timeout. */
+		return 0;
 
-	return st;
+	return processed_system_time(j64) + delta * (u64)NS_PER_TICK;
 }
 EXPORT_SYMBOL(jiffies_to_st);
 
-/*
- * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
- * These functions are based on implementations from arch/s390/kernel/time.c
- */
-static void stop_hz_timer(void)
-{
-	struct vcpu_set_singleshot_timer singleshot;
-	unsigned int cpu = smp_processor_id();
-	unsigned long j;
-	int rc;
-
-	cpumask_set_cpu(cpu, nohz_cpu_mask);
-
-	/* See matching smp_mb in rcu_start_batch in rcupdate.c.  These mbs  */
-	/* ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a  */
-	/* value of rcp->cur that matches rdp->quiescbatch and allows us to  */
-	/* stop the hz timer then the cpumasks created for subsequent values */
-	/* of cur in rcu_start_batch are guaranteed to pick up the updated   */
-	/* nohz_cpu_mask and so will not depend on this cpu.                 */
-
-	smp_mb();
-
-	/* Leave ourselves in tick mode if rcu or softirq or timer pending. */
-	if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
-	    local_softirq_pending() ||
-	    (j = get_next_timer_interrupt(jiffies),
-	     time_before_eq(j, jiffies))) {
-		cpumask_clear_cpu(cpu, nohz_cpu_mask);
-		j = jiffies + 1;
-	}
-
-	singleshot.timeout_abs_ns = jiffies_to_st(j);
-	if (!singleshot.timeout_abs_ns)
-		return;
-	singleshot.timeout_abs_ns += NS_PER_TICK / 2;
-	singleshot.flags = 0;
-	rc = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &singleshot);
-#if CONFIG_XEN_COMPAT <= 0x030004
-	if (rc) {
-		BUG_ON(rc != -ENOSYS);
-		rc = HYPERVISOR_set_timer_op(singleshot.timeout_abs_ns);
-	}
-#endif
-	BUG_ON(rc);
-}
-
-static void start_hz_timer(void)
-{
-	unsigned int cpu = smp_processor_id();
-	int rc = HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL);
-
-#if CONFIG_XEN_COMPAT <= 0x030004
-	if (rc) {
-		BUG_ON(rc != -ENOSYS);
-		rc = HYPERVISOR_set_timer_op(0);
-	}
-#endif
-	BUG_ON(rc);
-	cpumask_clear_cpu(cpu, nohz_cpu_mask);
-}
-
-void xen_safe_halt(void)
-{
-	stop_hz_timer();
-	/* Blocking includes an implicit local_irq_enable(). */
-	HYPERVISOR_block();
-	start_hz_timer();
-}
-
-void xen_halt(void)
-{
-	if (irqs_disabled())
-		VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
-}
-
-#ifdef CONFIG_SMP
-int __cpuinit local_setup_timer(unsigned int cpu)
-{
-	int seq, irq;
-
-	BUG_ON(cpu == 0);
-
-	switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu,
-			   &xen_set_periodic_tick)) {
-	case 0:
-#if CONFIG_XEN_COMPAT <= 0x030004
-	case -ENOSYS:
-#endif
-		break;
-	default:
-		BUG();
-	}
-
-	do {
-		seq = read_seqbegin(&xtime_lock);
-		/* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
-		per_cpu(processed_system_time, cpu) =
-			per_cpu(shadow_time, 0).system_timestamp;
-		init_missing_ticks_accounting(cpu);
-	} while (read_seqretry(&xtime_lock, seq));
-
-	irq = bind_virq_to_irqaction(VIRQ_TIMER, cpu, &timer_action);
-	if (irq < 0)
-		return irq;
-	BUG_ON(timer_irq != irq);
-
-	return 0;
-}
-
-void __cpuinit local_teardown_timer(unsigned int cpu)
-{
-	BUG_ON(cpu == 0);
-	unbind_from_per_cpu_irq(timer_irq, cpu, &timer_action);
-}
-#endif
-
 #ifdef CONFIG_CPU_FREQ
 static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 
 				void *data)
--- head.orig/drivers/xen/Kconfig	2012-02-17 14:37:01.000000000 +0100
+++ head/drivers/xen/Kconfig	2012-02-17 14:37:09.000000000 +0100
@@ -286,9 +286,6 @@ endmenu
 config HAVE_IRQ_IGNORE_UNHANDLED
 	def_bool y
 
-config NO_IDLE_HZ
-	def_bool y
-
 config ARCH_HAS_WALK_MEMORY
 	def_bool y
 	depends on X86
--- head.orig/drivers/xen/core/Makefile	2012-02-17 14:35:00.000000000 +0100
+++ head/drivers/xen/core/Makefile	2012-02-17 14:37:13.000000000 +0100
@@ -10,3 +10,4 @@ obj-$(CONFIG_HOTPLUG_CPU)	+= cpu_hotplug
 obj-$(CONFIG_XEN_SMPBOOT)	+= smpboot.o
 obj-$(CONFIG_SMP)		+= spinlock.o
 obj-$(CONFIG_KEXEC)		+= machine_kexec.o
+obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ head/drivers/xen/core/clockevents.c	2011-12-23 11:43:31.000000000 +0100
@@ -0,0 +1,293 @@
+/*
+ *	Xen clockevent functions
+ *
+ *	See arch/x86/xen/time.c for copyright and credits for derived
+ *	portions of this file.
+ *
+ * Xen clockevent implementation
+ *
+ * Xen has two clockevent implementations:
+ *
+ * The old timer_op one works with all released versions of Xen prior
+ * to version 3.0.4.  This version of the hypervisor provides a
+ * single-shot timer with nanosecond resolution.  However, sharing the
+ * same event channel is a 100Hz tick which is delivered while the
+ * vcpu is running.  We don't care about or use this tick, but it will
+ * cause the core time code to think the timer fired too soon, and
+ * will end up resetting it each time.  It could be filtered, but
+ * doing so has complications when the ktime clocksource is not yet
+ * the xen clocksource (ie, at boot time).
+ *
+ * The new vcpu_op-based timer interface allows the tick timer period
+ * to be changed or turned off.  The tick timer is not useful as a
+ * periodic timer because events are only delivered to running vcpus.
+ * The one-shot timer can report when a timeout is in the past, so
+ * set_next_event is capable of returning -ETIME when appropriate.
+ * This interface is used when available.
+ */
+#include <linux/clockchips.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/math64.h>
+#include <asm/hypervisor.h>
+#include <xen/clock.h>
+#include <xen/evtchn.h>
+#include <xen/interface/vcpu.h>
+
+#define XEN_SHIFT 22
+
+/* Xen may fire a timer up to this many ns early */
+#define TIMER_SLOP	100000
+#define NS_PER_TICK	(1000000000LL / HZ)
+
+/*
+ * Get a hypervisor absolute time.  In theory we could maintain an
+ * offset between the kernel's time and the hypervisor's time, and
+ * apply that to a kernel's absolute timeout.  Unfortunately the
+ * hypervisor and kernel times can drift even if the kernel is using
+ * the Xen clocksource, because ntp can warp the kernel's clocksource.
+ */
+static u64 get_abs_timeout(unsigned long delta)
+{
+	return xen_local_clock() + delta;
+}
+
+#if CONFIG_XEN_COMPAT <= 0x030004
+static void timerop_set_mode(enum clock_event_mode mode,
+			     struct clock_event_device *evt)
+{
+	switch (mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+		WARN_ON(1); /* unsupported */
+		break;
+
+	case CLOCK_EVT_MODE_ONESHOT:
+	case CLOCK_EVT_MODE_RESUME:
+		break;
+
+	case CLOCK_EVT_MODE_UNUSED:
+	case CLOCK_EVT_MODE_SHUTDOWN:
+		if (HYPERVISOR_set_timer_op(0)) /* cancel timeout */
+			BUG();
+		break;
+	}
+}
+
+static int timerop_set_next_event(unsigned long delta,
+				  struct clock_event_device *evt)
+{
+	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+
+	if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
+		BUG();
+
+	/*
+	 * We may have missed the deadline, but there's no real way of
+	 * knowing for sure.  If the event was in the past, then we'll
+	 * get an immediate interrupt.
+	 */
+
+	return 0;
+}
+#endif
+
+static void vcpuop_set_mode(enum clock_event_mode mode,
+			    struct clock_event_device *evt)
+{
+	switch (mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+		WARN_ON(1); /* unsupported */
+		break;
+
+	case CLOCK_EVT_MODE_UNUSED:
+	case CLOCK_EVT_MODE_SHUTDOWN:
+		if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer,
+				       smp_processor_id(), NULL))
+			BUG();
+		/* fall through */
+	case CLOCK_EVT_MODE_ONESHOT:
+		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
+				       smp_processor_id(), NULL))
+			BUG();
+		break;
+
+	case CLOCK_EVT_MODE_RESUME:
+		break;
+	}
+}
+
+static int vcpuop_set_next_event(unsigned long delta,
+				 struct clock_event_device *evt)
+{
+	struct vcpu_set_singleshot_timer single;
+	int ret;
+
+	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+
+	single.timeout_abs_ns = get_abs_timeout(delta);
+	single.flags = VCPU_SSHOTTMR_future;
+
+	ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer,
+				 smp_processor_id(), &single);
+
+	BUG_ON(ret != 0 && ret != -ETIME);
+
+	return ret;
+}
+
+static DEFINE_PER_CPU(struct clock_event_device, xen_clock_event) = {
+	.name		= "xen",
+	.features	= CLOCK_EVT_FEAT_ONESHOT,
+
+	.max_delta_ns	= 0xffffffff,
+	.min_delta_ns	= TIMER_SLOP,
+
+	.mult		= 1,
+	.shift		= 0,
+	.rating		= 500,
+
+	.irq		= -1,
+};
+
+/* snapshots of runstate info */
+static DEFINE_PER_CPU(u64, runnable_snapshot);
+static DEFINE_PER_CPU(u64, offline_snapshot);
+
+/* unused ns of stolen time */
+static DEFINE_PER_CPU(unsigned int, residual_stolen);
+
+static void init_missing_ticks_accounting(unsigned int cpu)
+{
+	setup_runstate_area(cpu);
+	if (cpu == smp_processor_id()) {
+		this_cpu_write(runnable_snapshot,
+			       this_vcpu_read(runstate.time[RUNSTATE_runnable]));
+		this_cpu_write(offline_snapshot,
+			       this_vcpu_read(runstate.time[RUNSTATE_offline]));
+	}
+	per_cpu(residual_stolen, cpu) = 0;
+}
+
+static irqreturn_t timer_interrupt(int irq, void *dev_id)
+{
+	struct clock_event_device *evt = &__get_cpu_var(xen_clock_event);
+	u64 runnable, offline;
+	s64 stolen;
+	irqreturn_t ret = IRQ_NONE;
+
+	if (evt->event_handler) {
+		evt->event_handler(evt);
+		ret = IRQ_HANDLED;
+	}
+
+	xen_check_wallclock_update();
+
+	runnable = this_vcpu_read(runstate.time[RUNSTATE_runnable]);
+	offline = this_vcpu_read(runstate.time[RUNSTATE_offline]);
+
+	stolen = runnable - __this_cpu_read(runnable_snapshot)
+		 + offline - __this_cpu_read(offline_snapshot)
+		 + __this_cpu_read(residual_stolen);
+
+	if (stolen >= NS_PER_TICK)
+		account_steal_ticks(div_u64_rem(stolen, NS_PER_TICK,
+						&__get_cpu_var(residual_stolen)));
+	else
+		__this_cpu_write(residual_stolen, stolen > 0 ? stolen : 0);
+
+	__this_cpu_write(runnable_snapshot, runnable);
+	__this_cpu_write(offline_snapshot, offline);
+
+	return ret;
+}
+
+static struct irqaction timer_action = {
+	.handler = timer_interrupt,
+	.flags   = IRQF_DISABLED|IRQF_TIMER,
+	.name    = "timer"
+};
+
+void __cpuinit xen_setup_cpu_clockevents(void)
+{
+	unsigned int cpu = smp_processor_id();
+	struct clock_event_device *evt = &per_cpu(xen_clock_event, cpu);
+
+	init_missing_ticks_accounting(cpu);
+
+	evt->cpumask = cpumask_of(cpu);
+	clockevents_register_device(evt);
+}
+
+#ifdef CONFIG_SMP
+int __cpuinit local_setup_timer(unsigned int cpu)
+{
+	struct clock_event_device *evt = &per_cpu(xen_clock_event, cpu);
+
+	BUG_ON(cpu == smp_processor_id());
+
+	evt->irq = bind_virq_to_irqaction(VIRQ_TIMER, cpu, &timer_action);
+	if (evt->irq < 0)
+		return evt->irq;
+	BUG_ON(per_cpu(xen_clock_event.irq, 0) != evt->irq);
+
+	evt->set_mode = this_cpu_read(xen_clock_event.set_mode);
+	evt->set_next_event = this_cpu_read(xen_clock_event.set_next_event);
+
+	return 0;
+}
+
+void __cpuinit local_teardown_timer(unsigned int cpu)
+{
+	struct clock_event_device *evt = &per_cpu(xen_clock_event, cpu);
+
+	BUG_ON(cpu == 0);
+	unbind_from_per_cpu_irq(evt->irq, cpu, &timer_action);
+}
+#endif
+
+void xen_clockevents_resume(void)
+{
+	unsigned int cpu;
+
+	if (__this_cpu_read(xen_clock_event.set_mode) != vcpuop_set_mode)
+		return;
+
+	for_each_online_cpu(cpu) {
+		init_missing_ticks_accounting(cpu);
+		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
+			BUG();
+	}
+}
+
+void __init xen_clockevents_init(void)
+{
+	unsigned int cpu = smp_processor_id();
+	struct clock_event_device *evt = &__get_cpu_var(xen_clock_event);
+
+	switch (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
+				   cpu, NULL)) {
+	case 0:
+		/*
+		 * Successfully turned off 100Hz tick, so we have the
+		 * vcpuop-based timer interface
+		 */
+		evt->set_mode = vcpuop_set_mode;
+		evt->set_next_event = vcpuop_set_next_event;
+		break;
+#if CONFIG_XEN_COMPAT <= 0x030004
+	case -ENOSYS:
+		printk(KERN_DEBUG "Xen: using timerop interface\n");
+		evt->set_mode = timerop_set_mode;
+		evt->set_next_event = timerop_set_next_event;
+		break;
+#endif
+	default:
+		BUG();
+	}
+
+	evt->irq = bind_virq_to_irqaction(VIRQ_TIMER, cpu, &timer_action);
+	BUG_ON(evt->irq < 0);
+
+	xen_setup_cpu_clockevents();
+}
--- head.orig/drivers/xen/core/evtchn.c	2012-04-03 17:09:14.000000000 +0200
+++ head/drivers/xen/core/evtchn.c	2012-04-03 17:09:23.000000000 +0200
@@ -389,6 +389,7 @@ asmlinkage void __irq_entry evtchn_do_up
 		wmb();
 #endif
 
+#ifndef CONFIG_NO_HZ
 		/*
 		 * Handle timer interrupts before all others, so that all
 		 * hardirq handlers see an up-to-date system time even if we
@@ -414,6 +415,7 @@ asmlinkage void __irq_entry evtchn_do_up
 					BUG();
 			}
 		}
+#endif /* CONFIG_NO_HZ */
 
 		l1 = xchg(&vcpu_info->evtchn_pending_sel, 0);
 
--- head.orig/drivers/xen/core/machine_reboot.c	2012-04-11 16:09:22.000000000 +0200
+++ head/drivers/xen/core/machine_reboot.c	2011-11-18 16:11:15.000000000 +0100
@@ -13,6 +13,7 @@
 #include <asm/hypervisor.h>
 #include <xen/xenbus.h>
 #include <linux/cpu.h>
+#include <xen/clock.h>
 #include <xen/gnttab.h>
 #include <xen/xencons.h>
 #include <xen/cpu_hotplug.h>
@@ -158,10 +159,12 @@ static int take_machine_down(void *_susp
 	} else
 		BUG_ON(suspend_cancelled > 0);
 	suspend->resume_notifier(suspend_cancelled);
-	if (suspend_cancelled >= 0) {
+	if (suspend_cancelled >= 0)
 		post_suspend(suspend_cancelled);
+	if (!suspend_cancelled)
+		xen_clockevents_resume();
+	if (suspend_cancelled >= 0)
 		syscore_resume();
-	}
 	if (!suspend_cancelled) {
 #ifdef __x86_64__
 		/*
--- head.orig/drivers/xen/core/smpboot.c	2012-03-22 16:23:53.000000000 +0100
+++ head/drivers/xen/core/smpboot.c	2012-03-22 16:24:55.000000000 +0100
@@ -16,6 +16,7 @@
 #include <linux/percpu.h>
 #include <asm/desc.h>
 #include <asm/pgalloc.h>
+#include <xen/clock.h>
 #include <xen/evtchn.h>
 #include <xen/interface/vcpu.h>
 #include <xen/cpu_hotplug.h>
@@ -144,6 +145,7 @@ static void __cpuinit cpu_bringup(void)
 	identify_secondary_cpu(__this_cpu_ptr(&cpu_info));
 	touch_softlockup_watchdog();
 	preempt_disable();
+	xen_setup_cpu_clockevents();
 	cpu = smp_processor_id();
 	notify_cpu_starting(cpu);
 	ipi_call_lock_irq();
--- head.orig/drivers/xen/core/spinlock.c	2012-02-01 09:18:11.000000000 +0100
+++ head/drivers/xen/core/spinlock.c	2012-02-01 09:18:36.000000000 +0100
@@ -13,6 +13,7 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <asm/hardirq.h>
+#include <xen/clock.h>
 #include <xen/evtchn.h>
 
 struct spinning {
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ head/include/xen/clock.h	2011-12-23 11:30:11.000000000 +0100
@@ -0,0 +1,18 @@
+#ifndef __XEN_CPU_CLOCK_H__
+#define __XEN_CPU_CLOCK_H__
+
+void setup_runstate_area(unsigned int cpu);
+
+unsigned long long xen_local_clock(void);
+void xen_check_wallclock_update(void);
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+void xen_clockevents_init(void);
+void xen_setup_cpu_clockevents(void);
+void xen_clockevents_resume(void);
+#else
+static inline void xen_setup_cpu_clockevents(void) {}
+static inline void xen_clockevents_resume(void) {}
+#endif
+
+#endif /* __XEN_CPU_CLOCK_H__ */
--- head.orig/kernel/sched/core.c	2012-04-10 16:56:46.000000000 +0200
+++ head/kernel/sched/core.c	2012-04-11 17:02:37.000000000 +0200
@@ -2656,6 +2656,48 @@ static inline void task_group_account_fi
 }
 
 
+#if !defined(CONFIG_XEN) || defined(CONFIG_VIRT_CPU_ACCOUNTING)
+# define cputime_to_u64(t) ((__force u64)(t))
+#else
+# include <linux/syscore_ops.h>
+# define NS_PER_TICK (1000000000 / HZ)
+
+static DEFINE_PER_CPU(u64, steal_snapshot);
+static DEFINE_PER_CPU(unsigned int, steal_residual);
+
+static u64 cputime_to_u64(cputime_t t)
+{
+	u64 s = this_vcpu_read(runstate.time[RUNSTATE_runnable]);
+	unsigned long adj = div_u64_rem(s - __this_cpu_read(steal_snapshot)
+					  + __this_cpu_read(steal_residual),
+					NS_PER_TICK,
+					&__get_cpu_var(steal_residual));
+
+	__this_cpu_write(steal_snapshot, s);
+	if (t < jiffies_to_cputime(adj))
+		return 0;
+
+	return (__force u64)(t - jiffies_to_cputime(adj));
+}
+
+static void steal_resume(void)
+{
+	cputime_to_u64(((cputime_t)1 << (BITS_PER_LONG * sizeof(cputime_t)
+					 / sizeof(long) - 1)) - 1);
+}
+
+static struct syscore_ops steal_syscore_ops = {
+	.resume	= steal_resume,
+};
+
+static int __init steal_register(void)
+{
+	register_syscore_ops(&steal_syscore_ops);
+	return 0;
+}
+core_initcall(steal_register);
+#endif
+
 /*
  * Account user cpu time to a process.
  * @p: the process that the cpu time gets accounted to
@@ -2675,7 +2717,7 @@ void account_user_time(struct task_struc
 	index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
 
 	/* Add user time to cpustat. */
-	task_group_account_field(p, index, (__force u64) cputime);
+	task_group_account_field(p, index, cputime_to_u64(cputime));
 
 	/* Account for user time used */
 	acct_update_integrals(p);
@@ -2725,7 +2767,7 @@ void __account_system_time(struct task_s
 	account_group_system_time(p, cputime);
 
 	/* Add system time to cpustat. */
-	task_group_account_field(p, index, (__force u64) cputime);
+	task_group_account_field(p, index, cputime_to_u64(cputime));
 
 	/* Account for system time used */
 	acct_update_integrals(p);
@@ -2779,9 +2821,9 @@ void account_idle_time(cputime_t cputime
 	struct rq *rq = this_rq();
 
 	if (atomic_read(&rq->nr_iowait) > 0)
-		cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
+		cpustat[CPUTIME_IOWAIT] += cputime_to_u64(cputime);
 	else
-		cpustat[CPUTIME_IDLE] += (__force u64) cputime;
+		cpustat[CPUTIME_IDLE] += cputime_to_u64(cputime);
 }
 
 static __always_inline bool steal_account_process_tick(void)
@@ -2837,9 +2879,9 @@ static void irqtime_account_process_tick
 		return;
 
 	if (irqtime_account_hi_update()) {
-		cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
+		cpustat[CPUTIME_IRQ] += cputime_to_u64(cputime_one_jiffy);
 	} else if (irqtime_account_si_update()) {
-		cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
+		cpustat[CPUTIME_SOFTIRQ] += cputime_to_u64(cputime_one_jiffy);
 	} else if (this_cpu_ksoftirqd() == p) {
 		/*
 		 * ksoftirqd time do not get accounted in cpu_softirq_time.