Disable timer tick in idle Don't run the regular timer tick while the system is idle. This is useful to save power (doesn't yet) and for virtualization. Using some infrastructure from s390. Experimental. Still WIP. HPET is currently broken Signed-off-by: Andi Kleen Index: linux/arch/x86_64/kernel/apic.c =================================================================== --- linux.orig/arch/x86_64/kernel/apic.c +++ linux/arch/x86_64/kernel/apic.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -33,14 +34,19 @@ #include #include #include +#include +#include int apic_verbosity; +int apic_runs_main_timer; int disable_apic_timer __initdata; /* Using APIC to generate smp_local_timer_interrupt? */ int using_apic_timer = 0; +static unsigned int calibration_result; + static DEFINE_PER_CPU(int, prof_multiplier) = 1; static DEFINE_PER_CPU(int, prof_old_multiplier) = 1; static DEFINE_PER_CPU(int, prof_counter) = 1; @@ -644,6 +650,69 @@ void __init init_apic_mappings(void) #endif } +#ifdef CONFIG_NO_IDLE_HZ + +extern int sysctl_hz_timer; + +/* Delay timer */ +static void apic_delay_timer(void) +{ + long offset; + long left = apic_read(APIC_TMCCT); + long left_now = left % calibration_result; + long ticks; + + if (!in_interrupt() && local_softirq_pending()) + offset = 0; + else if (rcu_pending(smp_processor_id())) + offset = 0; + else + offset = next_timer_interrupt() - jiffies - 1; + ticks = left_now + offset*calibration_result; + if (ticks >= 0xffffff) + ticks = left_now + + (((0xffffff - left_now)/calibration_result)*calibration_result); + if (ticks != left) + apic_write(APIC_TMCCT, ticks); +} + +/* Let timer fire on next tick again */ +static void apic_restart_timer(void) +{ + unsigned long left; + left = apic_read(APIC_TMCCT); + if (left < calibration_result) + return; + apic_write(APIC_TMCCT, left % calibration_result); +} + +static int apic_timer_start_stop(struct notifier_block *me, unsigned long cmd, + void *data) +{ + unsigned long flags; + 
+ if (sysctl_hz_timer || + (smp_processor_id() == boot_cpu_id && !apic_runs_main_timer)) + return NOTIFY_DONE; + + local_irq_save(flags); + switch (cmd) { + case IDLE_START: + apic_delay_timer(); + break; + case IDLE_END: + apic_restart_timer(); + break; + } + local_irq_restore(flags); + return NOTIFY_DONE; +} + +static struct notifier_block apic_timer_notifier = { + .notifier_call = apic_timer_start_stop, +}; +#endif + /* * This function sets up the local APIC timer, with a timeout of * 'clocks' APIC bus clock. During calibration we actually call @@ -657,7 +726,12 @@ void __init init_apic_mappings(void) #define APIC_DIVISOR 16 -static void __setup_APIC_LVTT(unsigned int clocks) +void change_apic_timer(unsigned clocks) +{ + apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); +} + +void __setup_APIC_LVTT(unsigned int clocks) { unsigned int lvtt_value, tmp_value, ver; @@ -672,8 +746,7 @@ static void __setup_APIC_LVTT(unsigned i apic_write_around(APIC_TDCR, (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | APIC_TDR_DIV_16); - - apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); + change_apic_timer(clocks); } static void setup_APIC_timer(unsigned int clocks) @@ -682,25 +755,19 @@ static void setup_APIC_timer(unsigned in local_irq_save(flags); - /* For some reasons this doesn't work on Simics, so fake it for now */ - if (!strstr(boot_cpu_data.x86_model_id, "Screwdriver")) { - __setup_APIC_LVTT(clocks); - return; - } + printk("setup_APIC_timer\n"); /* wait for irq slice */ if (vxtime.hpet_address) { - int trigger = hpet_readl(HPET_T0_CMP); - while (hpet_readl(HPET_COUNTER) >= trigger) - /* do nothing */ ; - while (hpet_readl(HPET_COUNTER) < trigger) - /* do nothing */ ; + int tick = hpet_readl(HPET_COUNTER) / hpet_tick; + while (hpet_readl(HPET_COUNTER)/hpet_tick == tick) + /* no cpu relax to keep latency down */; } else { int c1, c2; outb_p(0x00, 0x43); c2 = inb_p(0x40); c2 |= inb_p(0x40) << 8; - do { + do { c1 = c2; outb_p(0x00, 0x43); c2 = inb_p(0x40); @@ 
-708,9 +775,12 @@ static void setup_APIC_timer(unsigned in } while (c2 - c1 < 300); } - __setup_APIC_LVTT(clocks); - + if (apic_runs_main_timer) + stop_timer_interrupt(); + if (smp_processor_id() != boot_cpu_id) + __setup_APIC_LVTT(clocks); local_irq_restore(flags); + printk("done\n"); } /* @@ -755,8 +825,6 @@ static int __init calibrate_APIC_clock(v return result * APIC_DIVISOR / HZ; } -static unsigned int calibration_result; - void __init setup_boot_APIC_clock (void) { if (disable_apic_timer) { @@ -775,6 +843,9 @@ void __init setup_boot_APIC_clock (void) */ setup_APIC_timer(calibration_result); +#ifdef CONFIG_NO_IDLE_HZ + idle_notifier_register(&apic_timer_notifier); +#endif local_irq_enable(); } @@ -845,10 +916,8 @@ int setup_profiling_timer(unsigned int m * value into /proc/profile. */ -void smp_local_timer_interrupt(struct pt_regs *regs) +void apic_profile_tick(int cpu, struct pt_regs *regs) { - int cpu = smp_processor_id(); - profile_tick(CPU_PROFILING, regs); if (--per_cpu(prof_counter, cpu) <= 0) { /* @@ -862,16 +931,29 @@ void smp_local_timer_interrupt(struct pt per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu); if (per_cpu(prof_counter, cpu) != per_cpu(prof_old_multiplier, cpu)) { - __setup_APIC_LVTT(calibration_result/ + change_apic_timer(calibration_result/ per_cpu(prof_counter, cpu)); per_cpu(prof_old_multiplier, cpu) = per_cpu(prof_counter, cpu); } -#ifdef CONFIG_SMP - update_process_times(user_mode(regs)); -#endif + if (cpu != boot_cpu_id) + update_process_times(user_mode(regs)); } +} + +void smp_local_timer_interrupt(struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + int idle = in_idle(); + + if (idle) + exit_idle(); + + if (apic_runs_main_timer && smp_processor_id() == boot_cpu_id) + main_timer_handler(regs); + else + apic_profile_tick(cpu, regs); /* * We take the 'long' return path, and there every subsystem @@ -895,11 +977,16 @@ void smp_local_timer_interrupt(struct pt */ void smp_apic_timer_interrupt(struct pt_regs *regs) { + 
int idle = in_idle(); + /* * the NMI deadlock-detector uses this. */ add_pda(apic_timer_irqs, 1); + if (idle) + exit_idle(); + /* * NOTE! We'd better ACK the irq immediately, * because timer handling can be slow. @@ -968,6 +1055,7 @@ __init int oem_force_hpet_timer(void) */ asmlinkage void smp_spurious_interrupt(void) { + int idle = in_idle(); unsigned int v; irq_enter(); /* @@ -994,6 +1082,9 @@ asmlinkage void smp_spurious_interrupt(v } #endif irq_exit(); + + if (idle) + exit_idle(); } /* @@ -1005,6 +1096,10 @@ asmlinkage void smp_error_interrupt(void unsigned int v, v1; irq_enter(); + + if (in_idle()) + exit_idle(); + /* First tickle the hardware, only then report what went on. -- REW */ v = apic_read(APIC_ESR); apic_write(APIC_ESR, 0); @@ -1024,6 +1119,7 @@ asmlinkage void smp_error_interrupt(void */ printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", smp_processor_id(), v , v1); + irq_exit(); } @@ -1083,6 +1179,13 @@ static __init int setup_noapictimer(char return 0; } +static __init int setup_apicmaintimer(char *str) +{ + apic_runs_main_timer = 1; + return 0; +} +__setup("apicmaintimer", setup_apicmaintimer); + /* dummy parsing: see setup.c */ __setup("disableapic", setup_disableapic); Index: linux/include/asm-x86_64/apic.h =================================================================== --- linux.orig/include/asm-x86_64/apic.h +++ linux/include/asm-x86_64/apic.h @@ -17,6 +17,7 @@ #define APIC_DEBUG 2 extern int apic_verbosity; +extern int apic_runs_main_timer; /* * Define the default level of output to be very little @@ -98,6 +99,7 @@ extern int APIC_init_uniprocessor (void) extern void disable_APIC_timer(void); extern void enable_APIC_timer(void); extern void clustered_apic_check(void); +extern void apic_profile_tick(int, struct pt_regs *); extern void nmi_watchdog_default(void); extern int setup_nmi_watchdog(char *); Index: linux/arch/x86_64/kernel/process.c =================================================================== --- 
linux.orig/arch/x86_64/kernel/process.c +++ linux/arch/x86_64/kernel/process.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -50,6 +51,7 @@ #include #include #include +#include asmlinkage extern void ret_from_fork(void); @@ -80,6 +82,32 @@ void enable_hlt(void) EXPORT_SYMBOL(enable_hlt); +static struct notifier_block *idle_notifier; + +void idle_notifier_register(struct notifier_block *n) +{ + /* no locking for now, could be added later if modules + use this */ + notifier_chain_register(&idle_notifier, n); +} + +enum idle_state { CPU_IDLE, CPU_NOT_IDLE }; +static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE; + +void enter_idle(void) +{ + if (cmpxchg(&__get_cpu_var(idle_state), CPU_NOT_IDLE, CPU_IDLE) == + CPU_NOT_IDLE) + notifier_call_chain(&idle_notifier, IDLE_START, NULL); +} + +void exit_idle(void) +{ + if (cmpxchg(&__get_cpu_var(idle_state), CPU_IDLE, CPU_NOT_IDLE) == + CPU_IDLE) + notifier_call_chain(&idle_notifier, IDLE_END, NULL); +} + /* * We use this if we don't have any better * idle routine.. @@ -126,6 +154,7 @@ static void poll_idle (void) } else { set_need_resched(); } + exit_idle(); } void cpu_idle_wait(void) @@ -195,6 +224,7 @@ void cpu_idle (void) if (__get_cpu_var(cpu_idle_state)) __get_cpu_var(cpu_idle_state) = 0; + enter_idle(); rmb(); idle = pm_idle; if (!idle) @@ -202,6 +232,7 @@ void cpu_idle (void) if (cpu_is_offline(smp_processor_id())) play_dead(); idle(); + exit_idle(); } schedule(); Index: linux/arch/x86_64/kernel/io_apic.c =================================================================== --- linux.orig/arch/x86_64/kernel/io_apic.c +++ linux/arch/x86_64/kernel/io_apic.c @@ -113,6 +113,11 @@ static void set_ioapic_affinity_irq(unsi cpus_and(mask, tmp, CPU_MASK_ALL); + /* Don't allow to redirect timer from irq 0. 
+ Should add error return */ + if (irq == 0) + return; + dest = cpu_mask_to_apicid(mask); /* Index: linux/kernel/sysctl.c =================================================================== --- linux.orig/kernel/sysctl.c +++ linux/kernel/sysctl.c @@ -555,6 +555,7 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#endif #ifdef CONFIG_NO_IDLE_HZ { .ctl_name = KERN_HZ_TIMER, @@ -565,6 +566,7 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_ARCH_S390 { .ctl_name = KERN_S390_USER_DEBUG_LOGGING, .procname = "userprocess_debug", Index: linux/arch/x86_64/kernel/time.c =================================================================== --- linux.orig/arch/x86_64/kernel/time.c +++ linux/arch/x86_64/kernel/time.c @@ -1,5 +1,5 @@ /* - * linux/arch/x86-64/kernel/time.c + * linux/arch/x86-64/kernel/time.c * * "High Precision Event Timer" based timekeeping. * @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #ifdef CONFIG_X86_LOCAL_APIC @@ -50,7 +51,6 @@ EXPORT_SYMBOL(jiffies_64); static void cpufreq_delayed_get(void); #endif extern void i8254_timer_resume(void); -extern int using_apic_timer; DEFINE_SPINLOCK(rtc_lock); DEFINE_SPINLOCK(i8253_lock); @@ -59,6 +59,7 @@ static int nohpet __initdata = 0; static int notsc __initdata = 0; #undef HPET_HACK_ENABLE_DANGEROUS +#define HPET_REPROGRAM_SAFETY 20 /* 5% of hpet_tick */ unsigned int cpu_khz; /* TSC clocks / usec, not used here */ static unsigned long hpet_period; /* fsecs / HPET clock */ @@ -68,6 +69,12 @@ unsigned long vxtime_hz = PIT_TICK_RATE; int report_lost_ticks; /* command line option */ unsigned long long monotonic_base; +#ifdef CONFIG_NO_IDLE_HZ +static int hpet_periodic = 0; +#else +static int hpet_periodic = 1; +#endif + struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */ volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; @@ -75,13 +82,14 @@ unsigned long __wall_jiffies __section_w struct 
timespec __xtime __section_xtime; struct timezone __sys_tz __section_sys_tz; -static inline void rdtscll_sync(unsigned long *tsc) -{ -#ifdef CONFIG_SMP - sync_core(); -#endif - rdtscll(*tsc); -} +static int __sync_timer(struct pt_regs *regs); + +int xprintk = 0; +int xprintk_count = 200; +#define Xprintk(fmt...) do { if (xprintk && xprintk_count > 0) \ + printk(fmt), xprintk_count--; } while(0) +static int enable_xprintk(char *s) { xprintk = 1; return 0; } +__setup("xprintk",enable_xprintk); /* * do_gettimeoffset() returns microseconds since last timer interrupt was @@ -93,16 +101,24 @@ static inline void rdtscll_sync(unsigned * together by xtime_lock. */ -static inline unsigned int do_gettimeoffset_tsc(void) +static inline unsigned int __do_gettimeoffset_tsc(void) { unsigned long t; unsigned long x; - rdtscll_sync(&t); + rdtscll(t); if (t < vxtime.last_tsc) t = vxtime.last_tsc; /* hack */ x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32; return x; } +static inline unsigned int do_gettimeoffset_tsc(void) +{ +#ifdef CONFIG_SMP + sync_core(); +#endif + return __do_gettimeoffset_tsc(); +} + static inline unsigned int do_gettimeoffset_hpet(void) { /* cap counter read to one tick to avoid inconsistencies */ @@ -301,7 +317,6 @@ unsigned long long monotonic_clock(void) last_offset = vxtime.last; base = monotonic_base; this_offset = hpet_readl(HPET_COUNTER); - } while (read_seqretry(&xtime_lock, seq)); offset = (this_offset - last_offset); offset *=(NSEC_PER_SEC/HZ)/hpet_tick; @@ -328,6 +343,10 @@ static noinline void handle_lost_ticks(i static long lost_count; static int warned; +#ifdef CONFIG_NO_IDLE_HZ + return; +#endif + if (report_lost_ticks) { printk(KERN_WARNING "time.c: Lost %d timer " "tick(s)! 
", lost); @@ -361,11 +380,14 @@ static noinline void handle_lost_ticks(i #endif } -static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) +unsigned pit_ch0_init = LATCH; + +int main_timer_handler(struct pt_regs *regs) { static unsigned long rtc_update = 0; unsigned long tsc; - int delay, offset = 0, lost = 0; + int delay, offset = 0, lost = 0, count = 0; + int i; /* * Here we are in the timer irq handler. We have irqs locally disabled (so we @@ -376,16 +398,19 @@ static irqreturn_t timer_interrupt(int i write_seqlock(&xtime_lock); + again: if (vxtime.hpet_address) - offset = hpet_readl(HPET_COUNTER); + count = offset = hpet_readl(HPET_COUNTER); if (hpet_use_timer) { /* if we're using the hpet timer functionality, * we can more accurately know the counter value * when the timer interrupt occured. */ - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - delay = hpet_readl(HPET_COUNTER) - offset; + offset = hpet_readl(HPET_T0_CMP); + if (hpet_periodic) + offset -= hpet_tick; + delay = count - offset; } else { spin_lock(&i8253_lock); outb_p(0x00, 0x43); @@ -395,7 +420,7 @@ static irqreturn_t timer_interrupt(int i delay = LATCH - 1 - delay; } - rdtscll_sync(&tsc); + rdtscll(tsc); if (vxtime.mode == VXTIME_HPET) { if (offset - vxtime.last > hpet_tick) { @@ -432,34 +457,34 @@ static irqreturn_t timer_interrupt(int i (((long) offset << 32) / vxtime.tsc_quot) - 1; } - if (lost > 0) { - handle_lost_ticks(lost, regs); - jiffies += lost; + for (i = 0; i <= lost; i++) { + update_process_times(regs ? 
user_mode(regs) : 0); + do_timer(regs); + } + + if (vxtime.hpet_address && !hpet_periodic) { + int to_next = hpet_tick - count%hpet_tick; + int next = count + to_next; + hpet_writel(next, HPET_T0_CMP); + Xprintk("count %u next +%d(%u.%u)\n", count, next - count, + (next-count)/hpet_tick, (next-count)%hpet_tick); + /* Paranoia: make sure to not underrun cmp */ + if (to_next < hpet_tick / HPET_REPROGRAM_SAFETY) { + if (hpet_readl(HPET_COUNTER) >= next) { + printk("hpet underrun!\n"); + goto again; + } + } + } + + /* On the boot CPU we do the job of the APIC timer interrupt + too */ + if (regs && (!using_apic_timer || smp_processor_id() == boot_cpu_id)) { + add_pda(apic_timer_irqs, 1); + apic_profile_tick(smp_processor_id(), regs); } /* - * Do the timer stuff. - */ - - do_timer(regs); -#ifndef CONFIG_SMP - update_process_times(user_mode(regs)); -#endif - -/* - * In the SMP case we use the local APIC timer interrupt to do the profiling, - * except when we simulate SMP mode on a uniprocessor system, in that case we - * have to call the local interrupt handler. - */ - -#ifndef CONFIG_X86_LOCAL_APIC - profile_tick(CPU_PROFILING, regs); -#else - if (!using_apic_timer) - smp_local_timer_interrupt(regs); -#endif - -/* * If we have an externally synchronized Linux clock, then update CMOS clock * accordingly every ~11 minutes. set_rtc_mmss() will be called in the jiffy * closest to exactly 500 ms before the next second. 
If the update fails, we @@ -475,9 +500,24 @@ static irqreturn_t timer_interrupt(int i write_sequnlock(&xtime_lock); + if (lost > 0) + handle_lost_ticks(lost, regs); + return IRQ_HANDLED; } +#ifndef CONFIG_NO_IDLE_HZ +static inline int __sync_timer(struct pt_regs *regs) +{ + return main_timer_handler(regs); +} +#endif + +static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + return __sync_timer(regs); +} + static unsigned int cyc2ns_scale; #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ @@ -810,10 +850,13 @@ static int hpet_timer_stop_set_go(unsign * and period also hpet_tick. */ if (hpet_use_timer) { - hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | - HPET_TN_32BIT, HPET_T0_CFG); + hpet_writel(HPET_TN_ENABLE | + (hpet_periodic ? HPET_TN_PERIODIC : 0) + | HPET_TN_SETVAL | HPET_TN_32BIT, HPET_T0_CFG); hpet_writel(hpet_tick, HPET_T0_CMP); - hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */ + /* Write accumulator for periodic mode */ + if (hpet_periodic) + hpet_writel(hpet_tick, HPET_T0_CMP); cfg |= HPET_CFG_LEGACY; } /* @@ -861,17 +904,338 @@ static int hpet_reenable(void) return hpet_timer_stop_set_go(hpet_tick); } -void __init pit_init(void) +#define PIT_MODE 0x43 +#define PIT_CH0 0x40 + +static void __init pit_stop_interrupt(void) { unsigned long flags; - spin_lock_irqsave(&i8253_lock, flags); - outb_p(0x34, 0x43); /* binary, mode 2, LSB/MSB, ch 0 */ - outb_p(LATCH & 0xff, 0x40); /* LSB */ - outb_p(LATCH >> 8, 0x40); /* MSB */ + + /* Set rate generator mode without interrupt */ + outb_p((2 << 1) | (3 << 4), PIT_MODE); + outb_p(1, PIT_CH0); + outb(0, PIT_CH0); spin_unlock_irqrestore(&i8253_lock, flags); } +void __init stop_timer_interrupt(void) +{ + if (vxtime.hpet_address) + hpet_timer_stop_set_go(0); + else + pit_stop_interrupt(); + printk("------- timer interrupt stopped\n"); +} + +/* + * Reprograms the next timer interrupt + * PIT timer reprogramming code taken from APM code. 
+ * [and taken here from T.Lindgren's patch] + * Note that PIT timer is a 16-bit timer, which allows max + * skip of only few seconds. + * Must run with interrupts off + */ +void reprogram_pit_timer(int skip) +{ + spin_lock(&i8253_lock); + outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ + outb_p(skip & 0xff, PIT_CH0); /* LSB */ + outb(skip >> 8, PIT_CH0); /* MSB */ + pit_ch0_init = skip; + spin_unlock(&i8253_lock); +} + +void __init pit_init(void) +{ + unsigned long flags; + local_irq_save(flags); + reprogram_pit_timer(LATCH); + local_irq_restore(flags); +} + +#ifdef CONFIG_NO_IDLE_HZ +int sysctl_hz_timer = 0; + +static __init int setup_hztimer(char *s) +{ + sysctl_hz_timer = 1; + return 0; +} +__setup("hztimer", setup_hztimer); + +enum timer_state { + TIMER_SKIPPING, + TIMER_NORMAL +}; +/* RED-PEN is it still normal when per cpu data is cloned? CHECKME */ +static DEFINE_PER_CPU(enum timer_state, timer_state) = TIMER_NORMAL; +static DEFINE_PER_CPU(unsigned long, jiffies_target); + +/* Alternatively could read the PIT here, but that would be much slower. */ +static int pit_left_in_jiffie(void) +{ + int left = 1000000/HZ - __do_gettimeoffset_tsc(); + if (left < 0) /* Lost a timer tick. Should not happen */ + left = 0; + return left / PIT_TICK_RATE; +} + +static noinline unsigned long timer_target(void) +{ + /* not correct - need a new mechanism to tell RCU about idle CPUs */ + if (rcu_pending(smp_processor_id())) + return jiffies + 1; + /* Expensive. It would be better if add_timer told us + when something changed. */ + return next_timer_interrupt(); +} + +/* Optimize away unnecessary timer state changes. These timers are off CPU + and always hundreds or even thousands of cycles away. So it is + better to touch them as rarely as possible. 
+ */ + +static int delay_needed(unsigned long target, unsigned long *old_target) +{ + if (__get_cpu_var(timer_state) == TIMER_SKIPPING) + *old_target = __get_cpu_var(jiffies_target); + else + *old_target = jiffies + 1; + if (target == *old_target) + return 0; + __get_cpu_var(jiffies_target) = target; + __get_cpu_var(timer_state) = TIMER_SKIPPING; + return 1; +} + +static noinline int timer_regular(void) +{ + if (__get_cpu_var(timer_state) == TIMER_NORMAL) + return 1; + __get_cpu_var(timer_state) = TIMER_NORMAL; + if (__get_cpu_var(jiffies_target) == jiffies+1) + return 1; + return 0; +} + +/* + * Delay PIT timer. + * Runs from each interrupt on Boot CPU. + * Runs with interrupts off. + * + * The regular reprogramming of the PIT with data from the TSC timer + * is unfortunate and probably will add regular errors and result in + * much more drift because we still use the PIT data for main time keeping. + * It's really bad that we cannot rely on the presence of the HPET timer + * here due to some chipsets just not implementing it :/ + * + * In theory we could switch main time keeping over to PM in this + * case which should run independently. The problem is that most PM + * timers are only 24bit and they wrap every 4s, but that may be acceptable + * because our delay times should be usually less. Double check this. 
+ */ + +static noinline void pit_delay_timer(void) +{ + int offset; + long target = timer_target(); + unsigned long old_target; + + if (!delay_needed(target, &old_target)) + return; + + offset = (int)(target - jiffies - 1)*LATCH + pit_left_in_jiffie(); + if (offset >= 0xffff) + offset = (0xfffe / LATCH) * LATCH; + + Xprintk("pit_delay_timer offset %u cur %lu target %lu skipping %lu\n", + offset, jiffies, target, target - jiffies); + + reprogram_pit_timer(offset); +} + +static void pit_restart_timer(void) +{ + if (timer_regular()) + return; + + reprogram_pit_timer(pit_left_in_jiffie()); +} + +/* + * Check if we lost enough time that we need to update jiffies or + * xtime again. + * + * The subtle point of this is that it results in the main timer handler + * being called on all CPUs, not only on the boot CPU. We do some locking + * to make sure they don't do it more often than once per jiffie. + * + * Weak point is the reliance on the TSC when their frequencies vary + * with powernow and which may stop in C3. + */ +static int __sync_timer(struct pt_regs *regs) +{ + static DEFINE_SPINLOCK(sync_timer_lock); + long offset; + + Xprintk("sync_timer jiffies %lu count %lu\n", jiffies, + vxtime.hpet_address ? hpet_readl(HPET_COUNTER) : 0); + + offset = __do_gettimeoffset_tsc(); + if (offset >= 1000000/HZ) { + spin_lock(&sync_timer_lock); +#ifdef CONFIG_SMP + /* Did another CPU have the same idea and got the lock + earlier? 
*/ + if (__do_gettimeoffset_tsc() < 1000000/HZ) { + spin_unlock(&sync_timer_lock); + return IRQ_HANDLED; + } +#endif + main_timer_handler(regs); + spin_unlock(&sync_timer_lock); + } + return IRQ_HANDLED; +} + +static void sync_timer(void) +{ + if (__get_cpu_var(timer_state) != TIMER_SKIPPING) + return; + __sync_timer(NULL); +} + +/* HPET timer delay support */ + +static noinline void hpet_delay_timer(void) +{ + unsigned long offset; + unsigned long target = timer_target(); + unsigned long old_target; + unsigned cmp, count, newcmp; + int left_now; + + if (hpet_periodic || !delay_needed(target, &old_target)) + return; + + /* + * Get current state of HPET. That's the slow part because + * the southbridge runs at a snail's pace compared to us and + * we have to wait for the reply. + * Might be possible to use the faster TSC here, but that + * has other drawbacks. + */ + cmp = hpet_readl(HPET_T0_CMP); + count = hpet_readl(HPET_COUNTER); + offset = target - jiffies - 1; + + /* How much left in current jiffie? */ + left_now = hpet_tick - count%hpet_tick; + + /* RED-PEN may want to round up here slightly to make sure the timer + triggers rather too late than too early. */ + + newcmp = count + left_now + offset * hpet_tick; + /* Should be ok to ignore wrap */ + +#define FMT(x) (x), (x)/(unsigned)hpet_tick, (x)%(unsigned)hpet_tick + Xprintk( + "hpet_delay_timer count %u cmp +%u(%u.%u)\n" + " newcmp +%u(%u.%u) offset %lu left_now %u(%u.%u)\n", + count, + FMT(cmp - count), + FMT(newcmp - count), + offset, + FMT(left_now)); + + hpet_writel(newcmp, HPET_T0_CMP); +} + +/* Let timer fire on next tick again */ +static noinline void hpet_restart_timer(void) +{ + unsigned count, left_now; + + if (timer_regular()) + return; + +again: + count = hpet_readl(HPET_COUNTER); + + /* How much left in current jiffie? 
*/ + left_now = hpet_tick - count%hpet_tick; + + { + unsigned cmp = hpet_readl(HPET_T0_CMP); + Xprintk("hpet_restart_timer count %u cmp +%u(%u.%u) left_now %u(%u.%u)\n", + count, FMT(cmp - count), FMT(left_now)); + } + + hpet_writel(count + left_now, HPET_T0_CMP); + /* When we're too narrow at expiry double check afterwards */ + if (left_now < hpet_tick / HPET_REPROGRAM_SAFETY && + hpet_readl(HPET_COUNTER) >= count + left_now) + goto again; +} + +static __init int enable_periodic_hpet(char *s) +{ + hpet_periodic = 1; + return 0; +} +__setup("periodichpet", enable_periodic_hpet); + +static int timer_start_stop(struct notifier_block *me, unsigned long cmd, + void *data) +{ + unsigned long flags; + + /* Non BP CPUs use the APIC timer only */ + if (sysctl_hz_timer || smp_processor_id() != 0) + return NOTIFY_DONE; + + /* Don't change timer status for a single interrupt because + we just go back to idle anyways. */ + if (in_interrupt() && !current->pid) { + /* But need to fix up the time for the interrupts */ + if (cmd == IDLE_END) + sync_timer(); + return NOTIFY_DONE; + } + + local_irq_save(flags); + switch (cmd) { + case IDLE_START: + if (vxtime.hpet_address) + hpet_delay_timer(); + else + pit_delay_timer(); + break; + case IDLE_END: + sync_timer(); + if (vxtime.hpet_address) + hpet_restart_timer(); + else + pit_restart_timer(); + break; + } + local_irq_restore(flags); + return NOTIFY_DONE; +} + +static struct notifier_block timer_idle_notifier = { + .notifier_call = timer_start_stop, +}; + +static __init int time_delay_init(void) +{ + idle_notifier_register(&timer_idle_notifier); + return 0; +} +core_initcall(time_delay_init); + +#endif + int __init time_setup(char *str) { report_lost_ticks = 1; @@ -936,7 +1300,7 @@ void __init time_init(void) vxtime.mode = VXTIME_TSC; vxtime.quot = (1000000L << 32) / vxtime_hz; vxtime.tsc_quot = (1000L << 32) / cpu_khz; - rdtscll_sync(&vxtime.last_tsc); + rdtscll(vxtime.last_tsc); setup_irq(0, &irq0); set_cyc2ns_scale(cpu_khz / 
1000); Index: linux/arch/x86_64/Kconfig =================================================================== --- linux.orig/arch/x86_64/Kconfig +++ linux/arch/x86_64/Kconfig @@ -334,6 +334,17 @@ config HPET_EMULATE_RTC bool "Provide RTC interrupt" depends on HPET_TIMER && RTC=y +config NO_IDLE_HZ + bool "Disable timer interrupt in idle" + depends on EXPERIMENTAL + help + Switches the regular timer interrupt off when the system + is idle. This can lower power usage and improve performance + in virtualized systems. However it adds more overhead for + interrupts in the kernel. Can be controlled at runtime using the + /proc/sys/kernel/hz_timer sysctl. + This is still an experimental feature. + config GART_IOMMU bool "IOMMU support" default y Index: linux/arch/x86_64/kernel/smp.c =================================================================== --- linux.orig/arch/x86_64/kernel/smp.c +++ linux/arch/x86_64/kernel/smp.c @@ -27,6 +27,7 @@ #include #include #include +#include #define __cpuinit __init @@ -134,6 +135,9 @@ asmlinkage void smp_invalidate_interrupt int sender; union smp_flush_state *f; + if (in_idle()) + exit_idle(); + cpu = smp_processor_id(); /* * orig_rax contains the interrupt vector - 256. @@ -495,6 +499,8 @@ void smp_send_stop(void) asmlinkage void smp_reschedule_interrupt(void) { ack_APIC_irq(); + if (in_idle()) + exit_idle(); } asmlinkage void smp_call_function_interrupt(void) @@ -504,6 +510,9 @@ asmlinkage void smp_call_function_interr int wait = call_data->wait; ack_APIC_irq(); + if (in_idle()) + exit_idle(); + /* * Notify initiating CPU that I've grabbed the data and am * about to execute the function Index: linux/include/asm-x86_64/idle.h =================================================================== --- /dev/null +++ linux/include/asm-x86_64/idle.h @@ -0,0 +1,24 @@ +#ifndef _ASM_X86_64_IDLE_H +#define _ASM_X86_64_IDLE_H 1 + +/* Infrastructure for disabling timer ticks in the idle loop. 
*/ + +#include +#include + +#define IDLE_START 1 +#define IDLE_END 2 +struct notifier_block; +void idle_notifier_register(struct notifier_block *n); +void enter_idle(void); +void exit_idle(void); + +/* Needs to be special cased later when we ever do no tick in non idle */ +#define irq_enter_idle enter_idle + +static inline int in_idle(void) +{ + return ((current->pid | read_pda(irqcount)) == 0); +} + +#endif Index: linux/arch/x86_64/kernel/irq.c =================================================================== --- linux.orig/arch/x86_64/kernel/irq.c +++ linux/arch/x86_64/kernel/irq.c @@ -17,6 +17,7 @@ #include #include #include +#include atomic_t irq_err_count; #ifdef CONFIG_X86_IO_APIC @@ -99,10 +100,15 @@ asmlinkage unsigned int do_IRQ(struct pt unsigned irq = regs->orig_rax & 0xff; irq_enter(); + + if (in_idle()) + exit_idle(); __do_IRQ(irq, regs); irq_exit(); + /* enter_idle is done by idle itself again */ + return 1; } Index: linux/include/asm-x86_64/proto.h =================================================================== --- linux.orig/include/asm-x86_64/proto.h +++ linux/include/asm-x86_64/proto.h @@ -61,6 +61,9 @@ extern void free_bootmem_generic(unsigne extern void load_gs_index(unsigned gs); +extern void stop_timer_interrupt(void); +extern int main_timer_handler(struct pt_regs *regs); + extern unsigned long end_pfn_map; extern cpumask_t cpu_initialized; Index: linux/arch/x86_64/kernel/nmi.c =================================================================== --- linux.orig/arch/x86_64/kernel/nmi.c +++ linux/arch/x86_64/kernel/nmi.c @@ -476,6 +476,9 @@ void nmi_watchdog_tick (struct pt_regs * __get_cpu_var(nmi_touch) = 0; touched = 1; } + if (current->pid == 0 && read_pda(irqcount) == -1 && + (regs->eflags & EF_IE)) + touched = 1; if (!touched && __get_cpu_var(last_irq_sum) == sum) { /* * Ayiee, looks like this CPU is stuck ... 
Index: linux/include/asm-x86_64/hpet.h =================================================================== --- linux.orig/include/asm-x86_64/hpet.h +++ linux/include/asm-x86_64/hpet.h @@ -47,6 +47,7 @@ extern int is_hpet_enabled(void); extern int hpet_rtc_timer_init(void); extern int oem_force_hpet_timer(void); +extern unsigned long hpet_tick; #ifdef CONFIG_HPET_EMULATE_RTC extern int hpet_mask_rtc_irq_bit(unsigned long bit_mask);