Index: linux/arch/x86_64/kernel/apic.c =================================================================== --- linux.orig/arch/x86_64/kernel/apic.c +++ linux/arch/x86_64/kernel/apic.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -33,14 +34,19 @@ #include #include #include +#include +#include int apic_verbosity; +int apic_runs_main_timer; int disable_apic_timer __initdata; /* Using APIC to generate smp_local_timer_interrupt? */ int using_apic_timer = 0; +static unsigned int calibration_result; + static DEFINE_PER_CPU(int, prof_multiplier) = 1; static DEFINE_PER_CPU(int, prof_old_multiplier) = 1; static DEFINE_PER_CPU(int, prof_counter) = 1; @@ -637,6 +643,69 @@ void __init init_apic_mappings(void) #endif } +#ifdef CONFIG_NO_IDLE_HZ + +extern int sysctl_hz_timer; + +/* Delay timer */ +static void apic_delay_timer(void) +{ + long offset; + long left = apic_read(APIC_TMCCT); + long left_now = left % calibration_result; + long ticks; + + if (!in_interrupt() && local_softirq_pending()) + offset = 0; + else if (rcu_pending(smp_processor_id())) + offset = 0; + else + offset = next_timer_interrupt() - jiffies - 1; + ticks = left_now + offset*calibration_result; + if (ticks >= 0xffffff) + ticks = left_now + + (((0xffffff - left_now)/calibration_result)*calibration_result); + if (ticks != left) + apic_write(APIC_TMCCT, ticks); +} + +/* Let timer fire on next tick again */ +static void apic_restart_timer(void) +{ + unsigned long left; + left = apic_read(APIC_TMCCT); + if (left < calibration_result) + return; + apic_write(APIC_TMCCT, left % calibration_result); +} + +static int apic_timer_start_stop(struct notifier_block *me, unsigned long cmd, + void *data) +{ + unsigned long flags; + + if (sysctl_hz_timer || + (smp_processor_id() == boot_cpu_id && !apic_runs_main_timer)) + return NOTIFY_DONE; + + local_irq_save(flags); + switch (cmd) { + case IDLE_START: + apic_delay_timer(); + break; + case IDLE_END: + apic_restart_timer(); + break; + } + local_irq_restore(flags); + return NOTIFY_DONE; +} + +static struct notifier_block apic_timer_notifier = { + .notifier_call = apic_timer_start_stop, +}; +#endif + /* * This function sets up the local APIC timer, with a timeout of * 'clocks' APIC bus clock. During calibration we actually call @@ -650,7 +719,12 @@ void __init init_apic_mappings(void) #define APIC_DIVISOR 16 -static void __setup_APIC_LVTT(unsigned int clocks) +void change_apic_timer(unsigned clocks) +{ + apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); +} + +void __setup_APIC_LVTT(unsigned int clocks) { unsigned int lvtt_value, tmp_value, ver; @@ -667,8 +741,7 @@ static void __setup_APIC_LVTT(unsigned i apic_write_around(APIC_TDCR, (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | APIC_TDR_DIV_16); - - apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); + change_apic_timer(clocks); } static void setup_APIC_timer(unsigned int clocks) @@ -677,25 +750,19 @@ static void setup_APIC_timer(unsigned in local_irq_save(flags); - /* For some reasons this doesn't work on Simics, so fake it for now */ - if (!strstr(boot_cpu_data.x86_model_id, "Screwdriver")) { - __setup_APIC_LVTT(clocks); - return; - } + printk("setup_APIC_timer\n"); /* wait for irq slice */ if (vxtime.hpet_address) { - int trigger = hpet_readl(HPET_T0_CMP); - while (hpet_readl(HPET_COUNTER) >= trigger) - /* do nothing */ ; - while (hpet_readl(HPET_COUNTER) < trigger) - /* do nothing */ ; + int tick = hpet_readl(HPET_COUNTER) / hpet_tick; + while (hpet_readl(HPET_COUNTER)/hpet_tick == tick) + /* no cpu relax to keep latency down */; } else { int c1, c2; outb_p(0x00, 0x43); c2 = inb_p(0x40); c2 |= inb_p(0x40) << 8; - do { + do { c1 = c2; outb_p(0x00, 0x43); c2 = inb_p(0x40); @@ -703,9 +770,12 @@ static void setup_APIC_timer(unsigned in } while (c2 - c1 < 300); } - __setup_APIC_LVTT(clocks); - + if (apic_runs_main_timer) + stop_timer_interrupt(); + if (smp_processor_id() != boot_cpu_id) + __setup_APIC_LVTT(clocks); local_irq_restore(flags); + printk("done\n"); } /* @@ -750,8 +820,6 @@ static int __init calibrate_APIC_clock(v return result * APIC_DIVISOR / HZ; } -static unsigned int calibration_result; - void __init setup_boot_APIC_clock (void) { if (disable_apic_timer) { @@ -770,6 +838,9 @@ void __init setup_boot_APIC_clock (void) */ setup_APIC_timer(calibration_result); +#ifdef CONFIG_NO_IDLE_HZ + idle_notifier_register(&apic_timer_notifier); +#endif local_irq_enable(); } @@ -840,10 +911,8 @@ int setup_profiling_timer(unsigned int m * value into /proc/profile. */ -void smp_local_timer_interrupt(struct pt_regs *regs) +void apic_profile_tick(int cpu, struct pt_regs *regs) { - int cpu = smp_processor_id(); - profile_tick(CPU_PROFILING, regs); if (--per_cpu(prof_counter, cpu) <= 0) { /* @@ -857,16 +926,29 @@ void smp_local_timer_interrupt(struct pt per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu); if (per_cpu(prof_counter, cpu) != per_cpu(prof_old_multiplier, cpu)) { - __setup_APIC_LVTT(calibration_result/ + change_apic_timer(calibration_result/ per_cpu(prof_counter, cpu)); per_cpu(prof_old_multiplier, cpu) = per_cpu(prof_counter, cpu); } -#ifdef CONFIG_SMP - update_process_times(user_mode(regs)); -#endif + if (cpu != boot_cpu_id) + update_process_times(user_mode(regs)); } +} + +void smp_local_timer_interrupt(struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + int idle = in_idle(); + + if (idle) + exit_idle(); + + if (apic_runs_main_timer && smp_processor_id() == boot_cpu_id) + main_timer_handler(regs); + else + apic_profile_tick(cpu, regs); /* * We take the 'long' return path, and there every subsystem @@ -890,11 +972,16 @@ void smp_local_timer_interrupt(struct pt */ void smp_apic_timer_interrupt(struct pt_regs *regs) { + int idle = in_idle(); + /* * the NMI deadlock-detector uses this. */ add_pda(apic_timer_irqs, 1); + if (idle) + exit_idle(); + /* * NOTE! We'd better ACK the irq immediately, * because timer handling can be slow. @@ -963,6 +1050,7 @@ __init int oem_force_hpet_timer(void) */ asmlinkage void smp_spurious_interrupt(void) { + int idle = in_idle(); unsigned int v; irq_enter(); /* @@ -989,6 +1077,9 @@ asmlinkage void smp_spurious_interrupt(v } #endif irq_exit(); + + if (idle) + exit_idle(); } /* @@ -1000,6 +1091,10 @@ asmlinkage void smp_error_interrupt(void unsigned int v, v1; irq_enter(); + + if (in_idle()) + exit_idle(); + /* First tickle the hardware, only then report what went on. -- REW */ v = apic_read(APIC_ESR); apic_write(APIC_ESR, 0); @@ -1019,6 +1114,7 @@ asmlinkage void smp_error_interrupt(void */ printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", smp_processor_id(), v , v1); + irq_exit(); } @@ -1078,6 +1174,13 @@ static __init int setup_noapictimer(char return 0; } +static __init int setup_apicmaintimer(char *str) +{ + apic_runs_main_timer = 1; + return 0; +} +__setup("apicmaintimer", setup_apicmaintimer); + /* dummy parsing: see setup.c */ __setup("disableapic", setup_disableapic); Index: linux/include/asm-x86_64/apic.h =================================================================== --- linux.orig/include/asm-x86_64/apic.h +++ linux/include/asm-x86_64/apic.h @@ -17,6 +17,7 @@ #define APIC_DEBUG 2 extern int apic_verbosity; +extern int apic_runs_main_timer; /* * Define the default level of output to be very little @@ -98,6 +99,7 @@ extern int APIC_init_uniprocessor (void) extern void disable_APIC_timer(void); extern void enable_APIC_timer(void); extern void clustered_apic_check(void); +extern void apic_profile_tick(int, struct pt_regs *); extern int check_nmi_watchdog(void); extern void nmi_watchdog_default(void); Index: linux/arch/x86_64/kernel/process.c =================================================================== --- linux.orig/arch/x86_64/kernel/process.c +++ linux/arch/x86_64/kernel/process.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -48,6 +49,7 @@ #include #include #include +#include asmlinkage extern void ret_from_fork(void); @@ -78,6 +80,32 @@ void enable_hlt(void) EXPORT_SYMBOL(enable_hlt); +static struct notifier_block *idle_notifier; + +void idle_notifier_register(struct notifier_block *n) +{ + /* no locking for now, could be added later if modules + use this */ + notifier_chain_register(&idle_notifier, n); +} + +enum idle_state { CPU_IDLE, CPU_NOT_IDLE }; +static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE; + +void enter_idle(void) +{ + if (cmpxchg(&__get_cpu_var(idle_state), CPU_NOT_IDLE, CPU_IDLE) == + CPU_NOT_IDLE) + notifier_call_chain(&idle_notifier, IDLE_START, NULL); +} + +void exit_idle(void) +{ + if (cmpxchg(&__get_cpu_var(idle_state), CPU_IDLE, CPU_NOT_IDLE) == + CPU_IDLE) + notifier_call_chain(&idle_notifier, IDLE_END, NULL); +} + /* * We use this if we don't have any better * idle routine.. @@ -123,6 +151,7 @@ static void poll_idle (void) } else { set_need_resched(); } + exit_idle(); } void cpu_idle_wait(void) @@ -169,11 +198,16 @@ void cpu_idle (void) if (__get_cpu_var(cpu_idle_state)) __get_cpu_var(cpu_idle_state) = 0; + enter_idle(); rmb(); idle = pm_idle; if (!idle) idle = default_idle; idle(); + /* Normally exit_idle has been done by + the interrupt already, but not in MWAIT/POLL + It is safer to do it again. */ + exit_idle(); } schedule(); Index: linux/arch/x86_64/kernel/io_apic.c =================================================================== --- linux.orig/arch/x86_64/kernel/io_apic.c +++ linux/arch/x86_64/kernel/io_apic.c @@ -1318,6 +1318,11 @@ static void set_ioapic_affinity_irq(unsi unsigned long flags; unsigned int dest; + /* Don't allow to redirect timer from irq 0. + Should add error return */ + if (irq == 0) + return; + dest = cpu_mask_to_apicid(mask); /* Index: linux/kernel/sysctl.c =================================================================== --- linux.orig/kernel/sysctl.c +++ linux/kernel/sysctl.c @@ -551,6 +551,7 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#endif #ifdef CONFIG_NO_IDLE_HZ { .ctl_name = KERN_HZ_TIMER, @@ -561,6 +562,7 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_ARCH_S390 { .ctl_name = KERN_S390_USER_DEBUG_LOGGING, .procname = "userprocess_debug", Index: linux/arch/x86_64/kernel/time.c =================================================================== --- linux.orig/arch/x86_64/kernel/time.c +++ linux/arch/x86_64/kernel/time.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #ifdef CONFIG_X86_LOCAL_APIC @@ -49,7 +50,6 @@ EXPORT_SYMBOL(jiffies_64); static void cpufreq_delayed_get(void); #endif extern void i8254_timer_resume(void); -extern int using_apic_timer; DEFINE_SPINLOCK(rtc_lock); DEFINE_SPINLOCK(i8253_lock); @@ -58,6 +58,7 @@ static int nohpet __initdata = 0; static int notsc __initdata = 0; #undef HPET_HACK_ENABLE_DANGEROUS +#define HPET_REPROGRAM_SAFETY 20 /* 5% of hpet_tick */ unsigned int cpu_khz; /* TSC clocks / usec, not used here */ static unsigned long hpet_period; /* fsecs / HPET clock */ @@ -66,6 +67,12 @@ unsigned long vxtime_hz = PIT_TICK_RATE; int report_lost_ticks; /* command line option */ unsigned long long monotonic_base; +#ifdef CONFIG_NO_IDLE_HZ +static int hpet_periodic = 0; +#else +static int hpet_periodic = 1; +#endif + struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */ volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; @@ -73,13 +80,14 @@ unsigned long __wall_jiffies __section_w struct timespec __xtime __section_xtime; struct timezone __sys_tz __section_sys_tz; -static inline void rdtscll_sync(unsigned long *tsc) -{ -#ifdef CONFIG_SMP - sync_core(); -#endif - rdtscll(*tsc); -} +static int __sync_timer(struct pt_regs *regs); + +int xprintk = 0; +int xprintk_count = 200; +#define Xprintk(fmt...) do { if (xprintk && xprintk_count > 0) \ + printk(fmt), xprintk_count--; } while(0) +static int enable_xprintk(char *s) { xprintk = 1; return 0; } +__setup("xprintk",enable_xprintk); /* * do_gettimeoffset() returns microseconds since last timer interrupt was @@ -91,16 +99,24 @@ static inline void rdtscll_sync(unsigned * together by xtime_lock. */ -static inline unsigned int do_gettimeoffset_tsc(void) +static inline unsigned int __do_gettimeoffset_tsc(void) { unsigned long t; unsigned long x; - rdtscll_sync(&t); + rdtscll(t); if (t < vxtime.last_tsc) t = vxtime.last_tsc; /* hack */ x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32; return x; } +static inline unsigned int do_gettimeoffset_tsc(void) +{ +#ifdef CONFIG_SMP + sync_core(); +#endif + return __do_gettimeoffset_tsc(); +} + static inline unsigned int do_gettimeoffset_hpet(void) { return ((hpet_readl(HPET_COUNTER) - vxtime.last) * vxtime.quot) >> 32; @@ -299,8 +315,7 @@ unsigned long long monotonic_clock(void) last_offset = vxtime.last; base = monotonic_base; - this_offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - + this_offset = hpet_readl(HPET_COUNTER); } while (read_seqretry(&xtime_lock, seq)); offset = (this_offset - last_offset); offset *=(NSEC_PER_SEC/HZ)/hpet_tick; @@ -327,6 +342,10 @@ static noinline void handle_lost_ticks(i static long lost_count; static int warned; +#ifdef CONFIG_NO_IDLE_HZ + return; +#endif + if (report_lost_ticks) { printk(KERN_WARNING "time.c: Lost %d timer " "tick(s)! ", lost); @@ -360,11 +379,14 @@ static noinline void handle_lost_ticks(i #endif } -static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) +unsigned pit_ch0_init = LATCH; + +int main_timer_handler(struct pt_regs *regs) { static unsigned long rtc_update = 0; unsigned long tsc; - int delay, offset = 0, lost = 0; + int delay, offset = 0, lost = 0, count = 0; + int i; /* * Here we are in the timer irq handler. We have irqs locally disabled (so we @@ -375,19 +397,26 @@ static irqreturn_t timer_interrupt(int i write_seqlock(&xtime_lock); +again: if (vxtime.hpet_address) { - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - delay = hpet_readl(HPET_COUNTER) - offset; + /* RED-PEN need to handle wrap interrupt and retry sanely. */ + offset = hpet_readl(HPET_T0_CMP); + if (hpet_periodic) + offset -= hpet_tick; + count = hpet_readl(HPET_COUNTER); + delay = count - offset; + Xprintk("timer count %u(%u) offset %lu delay %lu\n", + count, count/hpet_tick, offset, delay); } else { spin_lock(&i8253_lock); outb_p(0x00, 0x43); delay = inb_p(0x40); delay |= inb(0x40) << 8; spin_unlock(&i8253_lock); - delay = LATCH - 1 - delay; + delay = pit_ch0_init - 1 - delay; } - rdtscll_sync(&tsc); + rdtscll(tsc); if (vxtime.mode == VXTIME_HPET) { if (offset - vxtime.last > hpet_tick) { @@ -424,34 +453,34 @@ static irqreturn_t timer_interrupt(int i (((long) offset << 32) / vxtime.tsc_quot) - 1; } - if (lost > 0) { - handle_lost_ticks(lost, regs); - jiffies += lost; + for (i = 0; i <= lost; i++) { + update_process_times(regs ? user_mode(regs) : 0); + do_timer(regs); + } + + if (vxtime.hpet_address && !hpet_periodic) { + int to_next = hpet_tick - count%hpet_tick; + int next = count + to_next; + hpet_writel(next, HPET_T0_CMP); + Xprintk("count %u next +%d(%u.%u)\n", count, next - count, + (next-count)/hpet_tick, (next-count)%hpet_tick); + /* Paranoia: make sure to not underrun cmp */ + if (to_next < hpet_tick / HPET_REPROGRAM_SAFETY) { + if (hpet_readl(HPET_COUNTER) >= next) { + printk("hpet underrun!\n"); + goto again; + } + } + } + + /* On the boot CPU we do the job of the APIC timer interrupt + too */ + if (regs && (!using_apic_timer || smp_processor_id() == boot_cpu_id)) { + add_pda(apic_timer_irqs, 1); + apic_profile_tick(smp_processor_id(), regs); } /* - * Do the timer stuff. - */ - - do_timer(regs); -#ifndef CONFIG_SMP - update_process_times(user_mode(regs)); -#endif - -/* - * In the SMP case we use the local APIC timer interrupt to do the profiling, - * except when we simulate SMP mode on a uniprocessor system, in that case we - * have to call the local interrupt handler. - */ - -#ifndef CONFIG_X86_LOCAL_APIC - profile_tick(CPU_PROFILING, regs); -#else - if (!using_apic_timer) - smp_local_timer_interrupt(regs); -#endif - -/* * If we have an externally synchronized Linux clock, then update CMOS clock * accordingly every ~11 minutes. set_rtc_mmss() will be called in the jiffy * closest to exactly 500 ms before the next second. If the update fails, we @@ -467,9 +496,24 @@ static irqreturn_t timer_interrupt(int i write_sequnlock(&xtime_lock); + if (lost > 0) + handle_lost_ticks(lost, regs); + return IRQ_HANDLED; } +#ifndef CONFIG_NO_IDLE_HZ +static inline int __sync_timer(struct pt_regs *regs) +{ + return main_timer_handler(regs); +} +#endif + +static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + return __sync_timer(regs); +} + static unsigned int cyc2ns_scale; #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ @@ -785,7 +829,7 @@ fs_initcall(late_hpet_init); static int hpet_timer_stop_set_go(unsigned long tick) { - unsigned int cfg; + unsigned int cfg, t0cfg; /* * Stop the timers and reset the main counter. @@ -797,15 +841,20 @@ static int hpet_timer_stop_set_go(unsign hpet_writel(0, HPET_COUNTER); hpet_writel(0, HPET_COUNTER + 4); + if (tick == 0) + return 0; + /* - * Set up timer 0, as periodic with first interrupt to happen at hpet_tick, - * and period also hpet_tick. + * Set up timer 0 with first interrupt to happen at hpet_tick. */ - hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | - HPET_TN_32BIT, HPET_T0_CFG); - hpet_writel(hpet_tick, HPET_T0_CMP); - hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */ + t0cfg = HPET_TN_ENABLE | HPET_TN_32BIT; + if (hpet_periodic) + t0cfg |= HPET_TN_PERIODIC | HPET_TN_SETVAL; + hpet_writel(t0cfg, HPET_T0_CFG); + if (hpet_periodic) + hpet_writel(hpet_tick, HPET_T0_CMP); /* write accumulator */ + hpet_writel(hpet_tick, HPET_T0_CMP); /* write first comparator */ /* * Go! @@ -851,17 +900,338 @@ static int hpet_reenable(void) return hpet_timer_stop_set_go(hpet_tick); } -void __init pit_init(void) +#define PIT_MODE 0x43 +#define PIT_CH0 0x40 + +static void __init pit_stop_interrupt(void) { unsigned long flags; - spin_lock_irqsave(&i8253_lock, flags); - outb_p(0x34, 0x43); /* binary, mode 2, LSB/MSB, ch 0 */ - outb_p(LATCH & 0xff, 0x40); /* LSB */ - outb_p(LATCH >> 8, 0x40); /* MSB */ + + /* Set rate generator mode without interrupt */ + outb_p((2 << 1) | (3 << 4), PIT_MODE); + outb_p(1, PIT_CH0); + outb(0, PIT_CH0); spin_unlock_irqrestore(&i8253_lock, flags); } +void __init stop_timer_interrupt(void) +{ + if (vxtime.hpet_address) + hpet_timer_stop_set_go(0); + else + pit_stop_interrupt(); + printk("------- timer interrupt stopped\n"); +} + +/* + * Reprograms the next timer interrupt + * PIT timer reprogramming code taken from APM code. + * [and taken here from T.Lindgren's patch] + * Note that PIT timer is a 16-bit timer, which allows max + * skip of only few seconds. + * Must run with interrupts off + */ +void reprogram_pit_timer(int skip) +{ + spin_lock(&i8253_lock); + outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ + outb_p(skip & 0xff, PIT_CH0); /* LSB */ + outb(skip >> 8, PIT_CH0); /* MSB */ + pit_ch0_init = skip; + spin_unlock(&i8253_lock); +} + +void __init pit_init(void) +{ + unsigned long flags; + local_irq_save(flags); + reprogram_pit_timer(LATCH); + local_irq_restore(flags); +} + +#ifdef CONFIG_NO_IDLE_HZ +int sysctl_hz_timer = 0; + +static __init int setup_hztimer(char *s) +{ + sysctl_hz_timer = 1; + return 0; +} +__setup("hztimer", setup_hztimer); + +enum timer_state { + TIMER_SKIPPING, + TIMER_NORMAL +}; +/* RED-PEN is it still normal when per cpu data is cloned? CHECKME */ +static DEFINE_PER_CPU(enum timer_state, timer_state) = TIMER_NORMAL; +static DEFINE_PER_CPU(unsigned long, jiffies_target); + +/* Alternatively could read the PIT here, but that would be much slower. */ +static int pit_left_in_jiffie(void) +{ + int left = 1000000/HZ - __do_gettimeoffset_tsc(); + if (left < 0) /* Lost a timer tick. Should not happen */ + left = 0; + return left / PIT_TICK_RATE; +} + +static noinline unsigned long timer_target(void) +{ + /* not correct - need a new mechanism to tell RCU about idle CPUs */ + if (rcu_pending(smp_processor_id())) + return jiffies + 1; + /* Expensive. It would be better if add_timer told us + when something changed. */ + return next_timer_interrupt(); +} + +/* Optimize away unnecessary timer state changes. These timers are off CPU + and always hundreds or even thousands of cycles away. So it is + better to touch them as rarely as possible. */ + +static int delay_needed(unsigned long target, unsigned long *old_target) +{ + if (__get_cpu_var(timer_state) == TIMER_SKIPPING) + *old_target = __get_cpu_var(jiffies_target); + else + *old_target = jiffies + 1; + if (target == *old_target) + return 0; + __get_cpu_var(jiffies_target) = target; + __get_cpu_var(timer_state) = TIMER_SKIPPING; + return 1; +} + +static noinline int timer_regular(void) +{ + if (__get_cpu_var(timer_state) == TIMER_NORMAL) + return 1; + __get_cpu_var(timer_state) = TIMER_NORMAL; + if (__get_cpu_var(jiffies_target) == jiffies+1) + return 1; + return 0; +} + +/* + * Delay PIT timer. + * Runs from each interrupt on Boot CPU. + * Runs with interrupts off. + * + * The regular reprogramming of the PIT with data from the TSC timer + * is unfortunate and probably will add regular errors and result in + * much more drift because we still use the PIT data for main time keeping. + * It's really bad that we cannot rely on the presence of the HPET timer + * here due some chipsets just not implementing it :/ + * + * In theory we could switch main time keeping over to PM in this + * case which should run independently. The problem is that most PM + * timers are only 24bit and they wrap every 4s, but that may be acceptable + * because our delay times should be usually less. Double check this. + */ + +static noinline void pit_delay_timer(void) +{ + int offset; + long target = timer_target(); + unsigned long old_target; + + if (!delay_needed(target, &old_target)) + return; + + offset = (int)(target - jiffies - 1)*LATCH + pit_left_in_jiffie(); + if (offset >= 0xffff) + offset = (0xfffe / LATCH) * LATCH; + + Xprintk("pit_delay_timer offset %u cur %lu target %lu skipping %lu\n", + offset, jiffies, target, target - jiffies); + + reprogram_pit_timer(offset); +} + +static void pit_restart_timer(void) +{ + if (timer_regular()) + return; + + reprogram_pit_timer(pit_left_in_jiffie()); +} + +/* + * Check if we lost enough time that we need to update jiffies or + * xtime again. + * + * The subtle point of this is that it results in the main timer handler + * being called on all CPUs, not only on the boot CPU. We do some locking + * to make sure they don't do it more often than once per jiffie. + * + * Weak point is the reliance on the TSC when their frequencies vary + * with powernow and which may stop in C3. + */ +static int __sync_timer(struct pt_regs *regs) +{ + static DEFINE_SPINLOCK(sync_timer_lock); + long offset; + + Xprintk("sync_timer jiffies %lu count %lu\n", jiffies, + vxtime.hpet_address ? hpet_readl(HPET_COUNTER) : 0); + + offset = __do_gettimeoffset_tsc(); + if (offset >= 1000000/HZ) { + spin_lock(&sync_timer_lock); +#ifdef CONFIG_SMP + /* Did another CPU have the same idea and got the lock + earlier? */ + if (__do_gettimeoffset_tsc() < 1000000/HZ) { + spin_unlock(&sync_timer_lock); + return IRQ_HANDLED; + } +#endif + main_timer_handler(regs); + spin_unlock(&sync_timer_lock); + } + return IRQ_HANDLED; +} + +static void sync_timer(void) +{ + if (__get_cpu_var(timer_state) != TIMER_SKIPPING) + return; + __sync_timer(NULL); +} + +/* HPET timer delay support */ + +static noinline void hpet_delay_timer(void) +{ + unsigned long offset; + unsigned long target = timer_target(); + unsigned long old_target; + unsigned cmp, count, newcmp; + int left_now; + + if (hpet_periodic || !delay_needed(target, &old_target)) + return; + + /* + * Get current state of HPET. That's the slow part because + * the southbridge runs at a snail's pace compared to us and + * we have to wait for the reply. + * Might be possible to use the faster TSC here, but that + * has other drawbacks. + */ + cmp = hpet_readl(HPET_T0_CMP); + count = hpet_readl(HPET_COUNTER); + offset = target - jiffies - 1; + + /* How much left in current jiffie? */ + left_now = hpet_tick - count%hpet_tick; + + /* RED-PEN may want to round up here slightly to make sure the timer + triggers rather too late than too early. */ + + newcmp = count + left_now + offset * hpet_tick; + /* Should be ok to ignore wrap */ + +#define FMT(x) (x), (x)/(unsigned)hpet_tick, (x)%(unsigned)hpet_tick + Xprintk( + "hpet_delay_timer count %u cmp +%u(%u.%u)\n" + " newcmp +%u(%u.%u) offset %lu left_now %u(%u.%u)\n", + count, + FMT(cmp - count), + FMT(newcmp - count), + offset, + FMT(left_now)); + + hpet_writel(newcmp, HPET_T0_CMP); +} + +/* Let timer fire on next tick again */ +static noinline void hpet_restart_timer(void) +{ + unsigned count, left_now; + + if (timer_regular()) + return; + +again: + count = hpet_readl(HPET_COUNTER); + + /* How much left in current jiffie? */ + left_now = hpet_tick - count%hpet_tick; + + { + unsigned cmp = hpet_readl(HPET_T0_CMP); + Xprintk("hpet_restart_timer count %u cmp +%u(%u.%u) left_now %u(%u.%u)\n", + count, FMT(cmp - count), FMT(left_now)); + } + + hpet_writel(count + left_now, HPET_T0_CMP); + /* When we're too narrow at expiry double check afterwards */ + if (left_now < hpet_tick / HPET_REPROGRAM_SAFETY && + hpet_readl(HPET_COUNTER) >= count + left_now) + goto again; +} + +static __init int enable_periodic_hpet(char *s) +{ + hpet_periodic = 1; + return 0; +} +__setup("periodichpet", enable_periodic_hpet); + +static int timer_start_stop(struct notifier_block *me, unsigned long cmd, + void *data) +{ + unsigned long flags; + + /* Non BP CPUs use the APIC timer only */ + if (sysctl_hz_timer || smp_processor_id() != 0) + return NOTIFY_DONE; + + /* Don't change timer status for a single interrupt because + we just go back to idle anyways. */ + if (in_interrupt() && !current->pid) { + /* But need to fix up the time for the interrupts */ + if (cmd == IDLE_END) + sync_timer(); + return NOTIFY_DONE; + } + + local_irq_save(flags); + switch (cmd) { + case IDLE_START: + if (vxtime.hpet_address) + hpet_delay_timer(); + else + pit_delay_timer(); + break; + case IDLE_END: + sync_timer(); + if (vxtime.hpet_address) + hpet_restart_timer(); + else + pit_restart_timer(); + break; + } + local_irq_restore(flags); + return NOTIFY_DONE; +} + +static struct notifier_block timer_idle_notifier = { + .notifier_call = timer_start_stop, +}; + +static __init int time_delay_init(void) +{ + idle_notifier_register(&timer_idle_notifier); + return 0; +} +core_initcall(time_delay_init); + +#endif + int __init time_setup(char *str) { report_lost_ticks = 1; @@ -925,7 +1295,7 @@ void __init time_init(void) vxtime.quot = (1000000L << 32) / vxtime_hz; vxtime.tsc_quot = (1000L << 32) / cpu_khz; vxtime.hz = vxtime_hz; - rdtscll_sync(&vxtime.last_tsc); + rdtscll(vxtime.last_tsc); setup_irq(0, &irq0); set_cyc2ns_scale(cpu_khz / 1000); Index: linux/arch/x86_64/Kconfig =================================================================== --- linux.orig/arch/x86_64/Kconfig +++ linux/arch/x86_64/Kconfig @@ -321,6 +321,17 @@ config HPET_EMULATE_RTC bool "Provide RTC interrupt" depends on HPET_TIMER && RTC=y +config NO_IDLE_HZ + bool "Disable timer interrupt in idle" + depends on EXPERIMENTAL + help + Switches the regular timer interrupt off when the system + is idle. This can lower power usage and improve performance + in virtualized systems. However it adds more overhead for + interrupts in the kernel. Can be controlled at runtime using the + /proc/sys/kernel/hz_timer sysctl. + This is still an experimental feature. + config GART_IOMMU bool "IOMMU support" default y Index: linux/arch/x86_64/kernel/smp.c =================================================================== --- linux.orig/arch/x86_64/kernel/smp.c +++ linux/arch/x86_64/kernel/smp.c @@ -28,6 +28,7 @@ #include #include #include +#include #define __cpuinit __init @@ -134,6 +135,8 @@ asmlinkage void smp_invalidate_interrupt int cpu; int sender; union smp_flush_state *f; + if (in_idle()) + exit_idle(); cpu = smp_processor_id(); /* @@ -428,6 +431,8 @@ void smp_send_stop(void) asmlinkage void smp_reschedule_interrupt(void) { ack_APIC_irq(); + if (in_idle()) + exit_idle(); } asmlinkage void smp_call_function_interrupt(void) @@ -440,6 +445,8 @@ asmlinkage void smp_call_function_interr atomic_inc(&data->started); mb(); ack_APIC_irq(); + if (in_idle()) + exit_idle(); irq_enter(); func(info); irq_exit(); Index: linux/include/asm-x86_64/idle.h =================================================================== --- /dev/null +++ linux/include/asm-x86_64/idle.h @@ -0,0 +1,24 @@ +#ifndef _ASM_X86_64_IDLE_H +#define _ASM_X86_64_IDLE_H 1 + +/* Infrastructure for disabling timer ticks in the idle loop. */ + +#include +#include + +#define IDLE_START 1 +#define IDLE_END 2 +struct notifier_block; +void idle_notifier_register(struct notifier_block *n); +void enter_idle(void); +void exit_idle(void); + +/* Needs to be special cased later when we ever do no tick in non idle */ +#define irq_enter_idle enter_idle + +static inline int in_idle(void) +{ + return ((current->pid | read_pda(irqcount)) == 0); +} + +#endif Index: linux/arch/x86_64/kernel/irq.c =================================================================== --- linux.orig/arch/x86_64/kernel/irq.c +++ linux/arch/x86_64/kernel/irq.c @@ -16,6 +16,7 @@ #include #include #include +#include atomic_t irq_err_count; #ifdef CONFIG_X86_IO_APIC @@ -98,11 +99,17 @@ asmlinkage unsigned int do_IRQ(struct pt unsigned irq = regs->orig_rax & 0xff; irq_enter(); + + if (in_idle()) + exit_idle(); + BUG_ON(irq > 256); __do_IRQ(irq, regs); irq_exit(); + /* enter_idle is done by idle itself again */ + return 1; } Index: linux/include/asm-x86_64/proto.h =================================================================== --- linux.orig/include/asm-x86_64/proto.h +++ linux/include/asm-x86_64/proto.h @@ -89,6 +89,9 @@ extern int unhandled_signal(struct task_ extern void select_idle_routine(const struct cpuinfo_x86 *c); extern void swiotlb_init(void); +extern void stop_timer_interrupt(void); +extern int main_timer_handler(struct pt_regs *regs); + extern unsigned long max_mapnr; extern unsigned long end_pfn; extern unsigned long table_start, table_end; Index: linux/arch/x86_64/kernel/nmi.c =================================================================== --- linux.orig/arch/x86_64/kernel/nmi.c +++ linux/arch/x86_64/kernel/nmi.c @@ -473,6 +473,9 @@ void nmi_watchdog_tick (struct pt_regs * __get_cpu_var(nmi_touch) = 0; touched = 1; } + if (current->pid == 0 && read_pda(irqcount) == -1 && + (regs->eflags & EF_IE)) + touched = 1; if (!touched && __get_cpu_var(last_irq_sum) == sum) { /* * Ayiee, looks like this CPU is stuck ... Index: linux/include/asm-x86_64/hpet.h =================================================================== --- linux.orig/include/asm-x86_64/hpet.h +++ linux/include/asm-x86_64/hpet.h @@ -47,6 +47,7 @@ extern int is_hpet_enabled(void); extern int hpet_rtc_timer_init(void); extern int oem_force_hpet_timer(void); +extern unsigned long hpet_tick; #ifdef CONFIG_HPET_EMULATE_RTC extern int hpet_mask_rtc_irq_bit(unsigned long bit_mask);