Index: Linux-2.6.X/fs/proc/array.c diff -u Linux-2.6.X/fs/proc/array.c:1.1.1.6 Linux-2.6.X/fs/proc/array.c:1.1.1.6.28.1.2.1.8.1 --- Linux-2.6.X/fs/proc/array.c:1.1.1.6 Mon Jul 12 09:41:32 2004 +++ Linux-2.6.X/fs/proc/array.c Mon Aug 2 12:51:38 2004 @@ -155,7 +155,7 @@ read_lock(&tasklist_lock); buffer += sprintf(buffer, "State:\t%s\n" - "SleepAVG:\t%lu%%\n" + "Burst:\t%d\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" @@ -163,7 +163,7 @@ "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - (p->sleep_avg/1024)*100/(1020000000/1024), + p->burst, p->tgid, p->pid, p->pid ? p->real_parent->pid : 0, p->pid && p->ptrace ? p->parent->pid : 0, @@ -427,3 +427,26 @@ return sprintf(buffer,"%d %d %d %d %d %d %d\n", size, resident, shared, text, lib, data, 0); } + +int task_cpu_sched_stats(struct task_struct *p, char *buffer) +{ + struct task_sched_stats stats; + unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; /* context switch counts */ + + read_lock(&tasklist_lock); + get_task_sched_stats(p, &stats); + nvcsw = p->nvcsw; + nivcsw = p-> nivcsw; + cnvcsw = p->cnvcsw; + cnivcsw = p->cnivcsw; + read_unlock(&tasklist_lock); + return sprintf(buffer, + "%llu %llu %llu %llu %llu %lu %lu %lu %lu @ %llu\n", + stats.total_sleep, + stats.total_cpu, + stats.total_delay, + stats.cycle_count, + stats.intr_wake_ups, + nvcsw, nivcsw, cnvcsw, cnivcsw, + stats.timestamp); +} Index: Linux-2.6.X/fs/proc/base.c diff -u Linux-2.6.X/fs/proc/base.c:1.1.1.11 Linux-2.6.X/fs/proc/base.c:1.1.1.11.6.1 --- Linux-2.6.X/fs/proc/base.c:1.1.1.11 Fri Jul 30 11:31:06 2004 +++ Linux-2.6.X/fs/proc/base.c Sun Aug 1 16:17:46 2004 @@ -83,6 +83,7 @@ PROC_TID_MAPS, PROC_TID_MOUNTS, PROC_TID_WCHAN, + PROC_TID_CPU_STATS, #ifdef CONFIG_SECURITY PROC_TID_ATTR, PROC_TID_ATTR_CURRENT, @@ -145,6 +146,7 @@ #ifdef CONFIG_KALLSYMS E(PROC_TID_WCHAN, "wchan", S_IFREG|S_IRUGO), #endif + E(PROC_TID_CPU_STATS, "cpustats", S_IFREG|S_IRUGO), {0,0,NULL,0} }; @@ -180,6 +182,7 @@ int proc_pid_stat(struct task_struct*,char*); int proc_pid_status(struct task_struct*,char*); int proc_pid_statm(struct task_struct*,char*); +extern int task_cpu_sched_stats(struct task_struct *p, char *buffer); static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) { @@ -1374,6 +1377,10 @@ ei->op.proc_read = proc_pid_wchan; break; #endif + case PROC_TID_CPU_STATS: + inode->i_fop = &proc_info_file_operations; + ei->op.proc_read = task_cpu_sched_stats; + break; default: printk("procfs: impossible type (%d)",p->type); iput(inode); Index: Linux-2.6.X/fs/proc/proc_misc.c diff -u Linux-2.6.X/fs/proc/proc_misc.c:1.1.1.10 Linux-2.6.X/fs/proc/proc_misc.c:1.1.1.10.6.1 --- Linux-2.6.X/fs/proc/proc_misc.c:1.1.1.10 Fri Jul 30 11:31:06 2004 +++ Linux-2.6.X/fs/proc/proc_misc.c Sun Aug 1 16:17:46 2004 @@ -270,6 +270,37 @@ .release = seq_release, }; +static int cpustats_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int i; + int len = 0; + struct cpu_sched_stats total = {0, }; + + for_each_online_cpu(i) { + struct cpu_sched_stats stats; + + get_cpu_sched_stats(i, &stats); + len += sprintf(page + len, "cpu%02d %llu %llu %llu %llu @ %llu\n", i, + stats.total_idle, + stats.total_busy, + stats.total_delay, + stats.nr_switches, + stats.timestamp); + total.total_idle += stats.total_idle; + total.total_busy += stats.total_busy; + total.total_delay += stats.total_delay; + total.nr_switches += stats.nr_switches; + } + len += sprintf(page + len, "total %llu %llu %llu %llu\n", + total.total_idle, + total.total_busy, + total.total_delay, + total.nr_switches); + + return proc_calc_metrics(page, start, off, count, eof, len); +} + extern struct seq_operations vmstat_op; static int vmstat_open(struct inode *inode, struct file *file) { @@ -689,6 +720,7 @@ {"cmdline", cmdline_read_proc}, {"locks", locks_read_proc}, {"execdomains", execdomains_read_proc}, + {"cpustats", cpustats_read_proc}, {NULL,} }; for (p = simple_ones; p->name; p++) Index: Linux-2.6.X/include/linux/init_task.h diff -u Linux-2.6.X/include/linux/init_task.h:1.1.1.7 Linux-2.6.X/include/linux/init_task.h:1.1.1.7.4.1.2.1.10.1.2.1 --- Linux-2.6.X/include/linux/init_task.h:1.1.1.7 Fri Jul 30 11:31:52 2004 +++ Linux-2.6.X/include/linux/init_task.h Mon Aug 2 14:27:26 2004 @@ -69,11 +69,14 @@ { \ .state = 0, \ .thread_info = &init_thread_info, \ + .rq = NULL, \ .usage = ATOMIC_INIT(2), \ .flags = 0, \ .lock_depth = -1, \ - .prio = MAX_PRIO-20, \ - .static_prio = MAX_PRIO-20, \ + .prio = MAX_PRIO-21, \ + .static_prio = MAX_PRIO-21, \ + .eb_priority = MAX_PRIO-21, \ + .eb_shares = DEFAULT_EB_SHARES, \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ @@ -113,6 +116,7 @@ .proc_lock = SPIN_LOCK_UNLOCKED, \ .switch_lock = SPIN_LOCK_UNLOCKED, \ .journal_info = NULL, \ + .sched_timestamp = ((INITIAL_JIFFIES * NSEC_PER_SEC) / HZ), \ INIT_TASK_PAGG(tsk) \ } Index: Linux-2.6.X/include/linux/sched.h diff -u Linux-2.6.X/include/linux/sched.h:1.1.1.13 Linux-2.6.X/include/linux/sched.h:1.1.1.13.2.1.2.1.2.1.2.1.4.1.2.1 --- Linux-2.6.X/include/linux/sched.h:1.1.1.13 Fri Jul 30 11:31:57 2004 +++ Linux-2.6.X/include/linux/sched.h Mon Aug 2 14:27:26 2004 @@ -126,7 +126,17 @@ #define SCHED_NORMAL 0 #define SCHED_FIFO 1 #define SCHED_RR 2 +#define SCHED_BATCH 3 +#define SCHED_ISO 4 +#define SCHED_MIN 0 +#define SCHED_MAX 4 + +#define SCHED_RANGE(policy) ((policy) >= SCHED_MIN && \ + (policy) <= SCHED_MAX) +#define SCHED_RT(policy) ((policy) == SCHED_FIFO || \ + (policy) == SCHED_RR) + struct sched_param { int sched_priority; }; @@ -314,9 +324,11 @@ #define MAX_USER_RT_PRIO 100 #define MAX_RT_PRIO MAX_USER_RT_PRIO -#define MAX_PRIO (MAX_RT_PRIO + 40) +#define MAX_PRIO (MAX_RT_PRIO + 41) #define rt_task(p) (unlikely((p)->prio < MAX_RT_PRIO)) +#define batch_task(p) ((p)->policy == SCHED_BATCH) +#define iso_task(p) ((p)->policy == SCHED_ISO) /* * Some day this will be a full-fledged user tracking system.. @@ -339,7 +351,7 @@ extern struct user_struct root_user; #define INIT_USER (&root_user) -typedef struct prio_array prio_array_t; +typedef struct runqueue runqueue_t; struct backing_dev_info; struct reclaim_state; @@ -403,6 +415,13 @@ struct audit_context; /* See audit.c */ struct mempolicy; +/* + * For entitlemnet based scheduling a task's shares will be determined from + * their "nice"ness + */ +#define EB_SHARES_PER_NICE 5 +#define DEFAULT_EB_SHARES (20 * EB_SHARES_PER_NICE) + struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ struct thread_info *thread_info; @@ -414,16 +433,25 @@ int prio, static_prio; struct list_head run_list; - prio_array_t *array; + runqueue_t *rq; - unsigned long sleep_avg; - long interactive_credit; unsigned long long timestamp; - int activated; + unsigned long runtime, totalrun; + unsigned int burst; + + unsigned long long sched_timestamp; + unsigned long long avg_sleep_per_cycle; + unsigned long long avg_delay_per_cycle, avg_delay_per_sub_cycle; + unsigned long long avg_cpu_per_cycle, avg_cpu_per_sub_cycle; + unsigned long long interactive_bonus, throughput_bonus, sub_cycle_count; + unsigned long long cycle_count, total_sleep, total_cpu, total_delay; + unsigned long long sleepiness, cpu_usage_rate, cpu_usage_rate_per_share; + unsigned int eb_priority, eb_shares; + unsigned long long intr_wake_ups; unsigned long policy; cpumask_t cpus_allowed; - unsigned int time_slice, first_time_slice; + unsigned int slice, time_slice; struct list_head tasks; struct list_head ptrace_children; @@ -578,6 +606,9 @@ #define PF_SWAPOFF 0x00080000 /* I am in swapoff */ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */ +#define PF_FORKED 0x00400000 /* I have just forked */ +#define PF_YIELDED 0x00800000 /* I have just yielded */ +#define PF_UISLEEP 0x01000000 /* Uninterruptible sleep */ #ifdef CONFIG_SMP #define SCHED_LOAD_SCALE 128UL /* increase resolution of load */ @@ -703,6 +734,43 @@ extern unsigned long long sched_clock(void); +/* + * Scheduling statistics for a task/thread + */ +struct task_sched_stats { + unsigned long long timestamp; + unsigned long long cycle_count; + unsigned long long total_sleep; + unsigned long long total_cpu; + unsigned long long total_delay; + unsigned long long intr_wake_ups; +}; + +/* + * Get "up to date" scheduling statistics for the given task + * This function should be used if reliable scheduling statistitcs are required + * outside the scheduler itself as the relevant fields in the task structure + * are not "up to date" NB the possible difference between those in the task + * structure and the correct values could be quite large for sleeping tasks. + */ +extern void get_task_sched_stats(const struct task_struct *tsk, struct task_sched_stats *stats); + +/* + * Scheduling statistics for a CPU + */ +struct cpu_sched_stats { + unsigned long long timestamp; + unsigned long long total_idle; + unsigned long long total_busy; + unsigned long long total_delay; + unsigned long long nr_switches; +}; + +/* + * Get scheduling statistics for the nominated CPU + */ +extern void get_cpu_sched_stats(unsigned int cpu, struct cpu_sched_stats *stats); + /* sched_exec is called by processes performing an exec */ #ifdef CONFIG_SMP extern void sched_exec(void); @@ -1086,10 +1154,7 @@ return p->thread_info->cpu; } -static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) -{ - p->thread_info->cpu = cpu; -} +void set_task_cpu(struct task_struct *p, unsigned int cpu); #else Index: Linux-2.6.X/include/linux/sysctl.h diff -u Linux-2.6.X/include/linux/sysctl.h:1.1.1.13 Linux-2.6.X/include/linux/sysctl.h:1.1.1.13.6.1 --- Linux-2.6.X/include/linux/sysctl.h:1.1.1.13 Fri Jul 30 11:31:51 2004 +++ Linux-2.6.X/include/linux/sysctl.h Sun Aug 1 16:17:46 2004 @@ -134,6 +134,7 @@ KERN_SPARC_SCONS_PWROFF=64, /* int: serial console power-off halt */ KERN_HZ_TIMER=65, /* int: hz timer on or off */ KERN_UNKNOWN_NMI_PANIC=66, /* int: unknown nmi panic flag */ + KERN_CPU_SCHED=67, /* CPU scheduler stuff */ }; Index: Linux-2.6.X/init/main.c diff -u Linux-2.6.X/init/main.c:1.1.1.14 Linux-2.6.X/init/main.c:1.1.1.14.16.1 --- Linux-2.6.X/init/main.c:1.1.1.14 Fri Jul 30 11:32:26 2004 +++ Linux-2.6.X/init/main.c Mon Aug 2 12:51:38 2004 @@ -359,8 +359,19 @@ #define smp_init() do { } while (0) #endif +unsigned long cache_decay_ticks; static inline void setup_per_cpu_areas(void) { } -static inline void smp_prepare_cpus(unsigned int maxcpus) { } +static void smp_prepare_cpus(unsigned int maxcpus) +{ + /* + * Generic 2 tick cache_decay for uniprocessor. + * FIXME - True cache decay ticks should be extracted from all the + * arch dependant SMP code to be set on UP as well. + */ + cache_decay_ticks = 2; + printk("Generic cache decay timeout: %ld msecs.\n", + (cache_decay_ticks * 1000 / HZ)); +} #else Index: Linux-2.6.X/kernel/sched.c diff -u Linux-2.6.X/kernel/sched.c:1.1.1.15 Linux-2.6.X/kernel/sched.c:1.1.1.15.2.3.2.1.2.1.2.1.4.1.2.1 --- Linux-2.6.X/kernel/sched.c:1.1.1.15 Fri Jul 30 11:31:17 2004 +++ Linux-2.6.X/kernel/sched.c Mon Aug 2 14:27:26 2004 @@ -16,6 +16,12 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin + * 2004-06-03 Single priority array, simplified interactive bonus + * mechanism, throughput bonus mechanism and + * entitlement based mode by Peter Williams + * (Courtesy of Aurema Pty Ltd, www.aurema.com) + * 2004-07-07 New staircase scheduling policy by Con Kolivas with help + * from William Lee Irwin III, Zwane Mwaikambo & Peter Williams. */ #include @@ -45,15 +51,26 @@ #include -#ifdef CONFIG_NUMA -#define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) -#else -#define cpu_to_node_mask(cpu) (cpu_online_map) +enum sched_mode_enum { + SCHED_MODE_STAIRCASE, + SCHED_MODE_PRIORITY_BASED, + SCHED_MODE_ENTITLEMENT_BASED +}; + +static enum sched_mode_enum sched_mode = SCHED_MODE_STAIRCASE; + +#ifdef CONFIG_SYSCTL +static const char *sched_mode_names[] = { + "sc", /* SCHED_MODE_STAIRCASE */ + "pb", /* SCHED_MODE_PRIORITY_BASED */ + "eb", /* SCHED_MODE_ENTITLEMENT_BASED */ + NULL /* end of list marker */ +}; #endif /* * Convert user-nice values [ -20 ... 0 ... 19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], + * to static priority [ MAX_RT_PRIO..MAX_PRIO-2 ], * and back. */ #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) @@ -63,138 +80,270 @@ /* * 'User priority' is the nice value converted to something we * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. + * it's a [ 0 ... 40 ] range. */ #define USER_PRIO(p) ((p)-MAX_RT_PRIO) #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) -#define AVG_TIMESLICE (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *\ - (MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1))) /* * Some helpers for converting nanosecond timing to jiffy resolution */ #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) + +static int sched_compute = 0; +/* + *This is the time all tasks within the same priority round robin. + *compute setting is reserved for dedicated computational scheduling + *and has ten times larger intervals. + */ +#define _RR_INTERVAL ((10 * HZ / 1000) ? : 1) +#define RR_INTERVAL() (_RR_INTERVAL * (1 + 9 * sched_compute)) /* * These are the 'tuning knobs' of the scheduler: - * - * Minimum timeslice is 10 msecs, default timeslice is 100 msecs, - * maximum timeslice is 200 msecs. Timeslices get refilled after - * they expire. - */ -#define MIN_TIMESLICE ( 10 * HZ / 1000) -#define MAX_TIMESLICE (200 * HZ / 1000) -#define ON_RUNQUEUE_WEIGHT 30 -#define CHILD_PENALTY 95 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (AVG_TIMESLICE * MAX_BONUS) -#define STARVATION_LIMIT (MAX_SLEEP_AVG) -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) -#define CREDIT_LIMIT 100 - -/* - * If a task is 'interactive' then we reinsert it in the active - * array after it has expired its current timeslice. (it will not - * continue to run immediately, it will still roundrobin with - * other interactive tasks.) - * - * This part scales the interactivity limit depending on niceness. - * - * We scale it linearly, offset by the INTERACTIVE_DELTA delta. - * Here are a few examples of different nice levels: - * - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] - * - * (the X axis represents the possible -5 ... 0 ... +5 dynamic - * priority range a task can explore, a value of '1' means the - * task is rated interactive.) - * - * Ie. nice +19 tasks can never get 'interactive' enough to be - * reinserted into the active array. And only heavily CPU-hog nice -20 - * tasks will be expired. Default nice 0 tasks are somewhere between, - * it takes some effort for them to get interactive, but it's not - * too hard. - */ - -#define CURRENT_BONUS(p) \ - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ - MAX_SLEEP_AVG) + * Making IDLE_PRIO bigger than 159 would require modification of bitmaps + */ +#define IDLE_PRIO 159 +#define BATCH_PRIO (IDLE_PRIO - 1) +#define MAX_TOTAL_BONUS (BATCH_PRIO - MAX_PRIO) +#define MAX_MAX_IA_BONUS ((MAX_TOTAL_BONUS + 1) / 2) +#define MAX_MAX_TPT_BONUS (MAX_TOTAL_BONUS - MAX_MAX_IA_BONUS) +#define DEFAULT_MAX_IA_BONUS MAX_MAX_IA_BONUS +#define DEFAULT_MAX_TPT_BONUS ((DEFAULT_MAX_IA_BONUS) / 2) +static unsigned int max_ia_bonus = DEFAULT_MAX_IA_BONUS; +static unsigned int initial_ia_bonus = 1; +static unsigned int max_tpt_bonus = DEFAULT_MAX_TPT_BONUS; -#ifdef CONFIG_SMP -#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ - num_online_cpus()) +/* + * Define some mini Kalman filter for estimating various averages, etc. + * To make it more efficient the denominator of the fixed point rational + * numbers used to store the averages and the response half life will + * be chosen so that the fixed point rational number reperesentation + * of (1 - alpha) * i (where i is an integer) will be i. + * Some of this is defined in linux/sched.h + */ + +/* + * Fixed denominator rational numbers for use by the CPU scheduler + */ +#define SCHED_AVG_OFFSET 4 +/* + * Get the rounded integer value of a scheduling statistic average field + * i.e. those fields whose names begin with avg_ + */ +#define SCHED_AVG_RND(x) \ + (((x) + (1 << (SCHED_AVG_OFFSET - 1))) >> (SCHED_AVG_OFFSET)) +#define SCHED_AVG_ALPHA ((1 << SCHED_AVG_OFFSET) - 1) +#define SCHED_AVG_MUL(a, b) (((a) * (b)) >> SCHED_AVG_OFFSET) +#define SCHED_AVG_REAL(a) ((a) << SCHED_AVG_OFFSET) + +/* + * Convert nice to shares + * Proportional symmetry is aimed for: i.e. + * (nice_to_shares(0) / nice_to_shares(19)) == (nice_to_shares(-20) / nice_to_shares(0)) + * Make sure that this function is robust for variations of EB_SHARES_PER_NICE + */ +static inline unsigned int nice_to_shares(int nice) +{ + unsigned int result = DEFAULT_EB_SHARES; + + if (nice > 0) + result -= (nice * (20 * EB_SHARES_PER_NICE - 1)) / 19; + else if (nice < 0) + result += (nice * nice * ((20 * EB_SHARES_PER_NICE - 1) * EB_SHARES_PER_NICE)) / 20; + + return result; +} + +#define SCHED_IA_BONUS_OFFSET 8 +#define SCHED_IA_BONUS_ALPHA ((1 << SCHED_IA_BONUS_OFFSET) - 1) +#define SCHED_IA_BONUS_MUL(a, b) (((a) * (b)) >> SCHED_IA_BONUS_OFFSET) +/* + * Get the rounded integer value of the interactive bonus + */ +#define SCHED_IA_BONUS_RND(x) \ + (((x) + (1 << (SCHED_IA_BONUS_OFFSET - 1))) >> (SCHED_IA_BONUS_OFFSET)) + +static inline void apply_sched_avg_decay(unsigned long long *valp) +{ + *valp = SCHED_AVG_MUL(*valp, SCHED_AVG_ALPHA); +} + +static inline void update_sched_ia_bonus(struct task_struct *p, unsigned long long incr) +{ + p->interactive_bonus = SCHED_IA_BONUS_MUL(p->interactive_bonus, SCHED_IA_BONUS_ALPHA); + p->interactive_bonus += incr; +} + +static inline unsigned long long sched_div_64(unsigned long long a, unsigned long long b) +{ +#if BITS_PER_LONG < 64 + /* + * Assume that there's no 64 bit divide available + */ + if (a < b) + return 0; + /* + * Scale down until b less than 32 bits so that we can do + * a divide using do_div() + */ + while (b > ULONG_MAX) { a >>= 1; b >>= 1; } + + (void)do_div(a, (unsigned long)b); + + return a; #else -#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) + return a / b; +#endif +} + +#define PROPORTION_OFFSET 32 +#if PROPORTION_OFFSET > 32 +#error "PROPORTION_OFFSET must be less than or equal to 32" #endif +#define PROPORTION_ONE ((unsigned long long)1 << PROPORTION_OFFSET) +#define PROPORTION_OVERFLOW (((unsigned long long)1 << (64 - PROPORTION_OFFSET)) - 1) +#define PROP_FM_PPT(a) (((unsigned long long)(a) * PROPORTION_ONE) / 1000) +/* + * Convert a / b to a proportion in the range 0 to PROPORTION_ONE + * Requires a <= b or may get a divide by zero exception + */ +static inline unsigned long long calc_proportion(unsigned long long a, unsigned long long b) +{ + if (unlikely(a == b)) + return PROPORTION_ONE; -#define SCALE(v1,v1_max,v2_max) \ - (v1) * (v2_max) / (v1_max) + while (a > PROPORTION_OVERFLOW) { a >>= 1; b >>= 1; } -#define DELTA(p) \ - (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) + return sched_div_64(a << PROPORTION_OFFSET, b); +} -#define TASK_INTERACTIVE(p) \ - ((p)->prio <= (p)->static_prio - DELTA(p)) +/* + * Map the given proportion to an unsigned long long in the specified range + * Requires range < PROPORTION_ONE to avoid overflow + */ +static inline unsigned long long map_proportion(unsigned long long prop, unsigned long long range) +{ + return (prop * range) >> PROPORTION_OFFSET; +} -#define INTERACTIVE_SLEEP(p) \ - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) +static inline unsigned long long map_proportion_rnd(unsigned long long prop, unsigned long long range) +{ + return map_proportion((prop >> 1), (range * 2 + 1)); +} -#define HIGH_CREDIT(p) \ - ((p)->interactive_credit > CREDIT_LIMIT) +/* + * Find the square root of a proportion + * Require: x <= PROPORTION_ONE + */ +static unsigned long long proportion_sqrt(unsigned long long x) +{ + unsigned long long res, b; + int bshift; + + /* + * Take shortcut AND prevent overflow + */ + if (x == PROPORTION_ONE) + return PROPORTION_ONE; -#define LOW_CREDIT(p) \ - ((p)->interactive_credit < -CREDIT_LIMIT) + res = 0; + b = (1UL << (PROPORTION_OFFSET - 1)); + bshift = PROPORTION_OFFSET - 1; + x <<= PROPORTION_OFFSET; + + for (; x && b; b >>= 1, bshift--) { + unsigned long long temp = (((res << 1) + b) << bshift); + + if (x >= temp) { + res += b; + x -= temp; + } + } -#define TASK_PREEMPTS_CURR(p, rq) \ - ((p)->prio < (rq)->curr->prio) + return res; +} /* - * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ] - * to time slice values. - * - * The higher a thread's priority, the bigger timeslices - * it gets during one round of execution. But even the lowest - * priority thread gets MIN_TIMESLICE worth of execution time. - * - * task_timeslice() is the interface that is used by the scheduler. + * Tasks that have a CPU usage rate greater than this threshold (in parts per + * thousand) are considered to be CPU bound and start to lose interactive bonus + * points + */ +#define DEFAULT_CPU_HOG_THRESHOLD 900 +static unsigned int cpu_hog_threshold_ppt = DEFAULT_CPU_HOG_THRESHOLD; +static unsigned long long cpu_hog_threshold = PROP_FM_PPT(DEFAULT_CPU_HOG_THRESHOLD); + +/* + * Tasks that would sleep for more than 900 parts per thousand of the time if + * they had the CPU to themselves are considered to be interactive provided + * that their average sleep duration per scheduling cycle isn't too long */ +#define DEFAULT_IA_THRESHOLD 900 +static unsigned int ia_threshold_ppt = DEFAULT_IA_THRESHOLD; +static unsigned long long ia_threshold = PROP_FM_PPT(DEFAULT_IA_THRESHOLD); +#define LOWER_MAX_IA_SLEEP SCHED_AVG_REAL(15 * 60LL * NSEC_PER_SEC) +#define UPPER_MAX_IA_SLEEP SCHED_AVG_REAL(2 * 60 * 60LL * NSEC_PER_SEC) -#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \ - ((MAX_TIMESLICE - MIN_TIMESLICE) * \ - (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1))) +/* + * For the "bp" scheduler: + * SCHED_ISO tasks that have a CPU usage rate less than this threshold (in parts per + * thousand) are treated as if they had a nice vale of -20 + */ +#define DEFAULT_SCHED_ISO_THRESHOLD 50 +static unsigned int sched_iso_threshold_ppt = DEFAULT_SCHED_ISO_THRESHOLD; +static unsigned long long sched_iso_threshold = PROP_FM_PPT(DEFAULT_SCHED_ISO_THRESHOLD); + +/* + * What "base time slice" for nice 0 and "average time slice" evaluated to + */ +#define MSECS_TO_JIFFIES(x) (((x) * (HZ * 2 + 1)) / 2000) +#define MSECS_TO_JIFFIES_MIN_1(x) (MSECS_TO_JIFFIES(x) ? MSECS_TO_JIFFIES(x) : 1) +#define DEFAULT_TIME_SLICE_MSECS 100 +#define MAX_TIME_SLICE_MSECS 1000 +#define DEFAULT_TIME_SLICE_TICKS MSECS_TO_JIFFIES_MIN_1(DEFAULT_TIME_SLICE_MSECS) + +static unsigned int time_slice_ticks = DEFAULT_TIME_SLICE_TICKS; +static unsigned int sched_rr_time_slice_ticks = DEFAULT_TIME_SLICE_TICKS; +static unsigned int sched_batch_time_slice_multiplier = 10; -static unsigned int task_timeslice(task_t *p) +static inline unsigned int task_timeslice(const task_t *p) { - return BASE_TIMESLICE(p); + if (unlikely(p->policy == SCHED_RR)) + return sched_rr_time_slice_ticks; + + if (unlikely(batch_task(p) && !(p->flags & PF_UISLEEP))) + return time_slice_ticks * sched_batch_time_slice_multiplier; + + return time_slice_ticks; } -#define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time) +#define task_hot(p, sd) ((p)->rq->timestamp_last_tick - (p)->timestamp < (sd)->cache_hot_time) /* * These are the runqueue data structures: */ +#define NUM_PRIO_SLOTS (IDLE_PRIO + 1) -#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) +/* + * Is the run queue idle? + */ +#define RUNQUEUE_IDLE(rq) ((rq)->curr == (rq)->idle) -typedef struct runqueue runqueue_t; +/* + * Control values for niceness + */ +#define PROSPECTIVE_BASE_PROM_INTERVAL_MSECS ((DEFAULT_TIME_SLICE_MSECS * 255) / 100) +#if (PROSPECTIVE_BASE_PROM_INTERVAL_MSECS > 0) +#define BASE_PROM_INTERVAL_MSECS PROSPECTIVE_BASE_PROM_INTERVAL_MSECS +#else +#define BASE_PROM_INTERVAL_MSECS DEFAULT_TIME_SLICE_MSECS +#endif +static unsigned int base_prom_interval_ticks = MSECS_TO_JIFFIES_MIN_1(BASE_PROM_INTERVAL_MSECS); -struct prio_array { - unsigned int nr_active; - unsigned long bitmap[BITMAP_SIZE]; - struct list_head queue[MAX_PRIO]; +struct prio_slot { + unsigned int prio; + struct list_head queue; }; /* @@ -215,15 +364,23 @@ #ifdef CONFIG_SMP unsigned long cpu_load; #endif + unsigned long avg_nr_running; unsigned long long nr_switches; - unsigned long expired_timestamp, nr_uninterruptible; + unsigned long nr_uninterruptible; unsigned long long timestamp_last_tick; + unsigned long long total_delay; + unsigned int cache_ticks, preempted; task_t *curr, *idle; struct mm_struct *prev_mm; - prio_array_t *active, *expired, arrays[2]; - int best_expired_prio; + DECLARE_BITMAP(bitmap, NUM_PRIO_SLOTS); + struct prio_slot queues[NUM_PRIO_SLOTS]; + struct prio_slot *current_prio_slot; + unsigned long next_prom_due; atomic_t nr_iowait; + unsigned long long eb_yardstick; + unsigned long long eb_ticks_to_decay; + #ifdef CONFIG_SMP struct sched_domain *sd; @@ -243,47 +400,153 @@ #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() (&__get_cpu_var(runqueues)) -#define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#define is_idle_task(p) ((p) == (p)->rq->idle) + +#ifdef CONFIG_SMP +void set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + BUG_ON(!list_empty(&p->run_list)); + + p->thread_info->cpu = cpu; + p->rq = cpu_rq(cpu); +} + +/* + * "p"'s runqueue and "oldrq" are locked when this is called + */ +static inline void adjust_timestamp(task_t *p, const runqueue_t *oldrq) +{ + p->timestamp += (p->rq->timestamp_last_tick - oldrq->timestamp_last_tick); +} + +/* + * adjust_sched_timestamp() is always called with p's runqueue locked but sometimes + * "oldrq" isn't locked and isn't "this_rq()" (e.g. in try_to_wake_up()) + * leading to possible (very rare) problems on systems where 64 bit reads are + * not atomic. + * + * We'll handle this problem by reading their "timestamp_last_tick"s until we + * get two the same. + */ +static inline void adjust_sched_timestamp(task_t *p, const runqueue_t *oldrq) +{ + unsigned long long oldrq_tlt = oldrq->timestamp_last_tick; + + if (oldrq != this_rq()) + while (unlikely(oldrq_tlt != oldrq->timestamp_last_tick)) + oldrq_tlt = oldrq->timestamp_last_tick; + + p->sched_timestamp += p->rq->timestamp_last_tick - oldrq_tlt; +} + +/* + * for use when the task may be on another CPU (to compensate for drift) + * + * This is only ever called when "p"'s runqueue is locked. + * Even though "this_rq()" may not be locked this should be safe as + * "timestamp_last_tick" is only ever changed by tasks running on the same CPU + * and so it won't be being changed while we read it. + */ +static inline unsigned long long adjusted_sched_clock(const task_t *p) +{ + runqueue_t *trq = this_rq(); + + return sched_clock() + (p->rq->timestamp_last_tick - trq->timestamp_last_tick); +} + +#else +static inline void adjust_timestamp(task_t *p, const runqueue_t *oldrq) +{ +} +#define adjusted_sched_clock(p) sched_clock() +#endif + +static inline void set_rq_current_prio(runqueue_t *rq, int prio) +{ + rq->current_prio_slot = rq->queues + prio; +} + +static inline int get_rq_current_prio(runqueue_t *rq) +{ + return rq->current_prio_slot->prio; +} + +static inline void set_rq_current_task(struct task_struct *p) +{ + p->rq->curr = p; + p->rq->current_prio_slot = p->rq->queues + p->prio; +} + /* * Default context-switch locking: */ #ifndef prepare_arch_switch # define prepare_arch_switch(rq, next) do { } while (0) # define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) -# define task_running(rq, p) ((rq)->curr == (p)) +# define task_is_running(p) ((p)->rq->curr == (p)) +#else +# define task_is_running(p) task_running((p)->rq, p) #endif +static inline unsigned long get_prom_interval(const struct runqueue *rq) +{ + if (rq->nr_running < 2) + return base_prom_interval_ticks; + return (rq->nr_running - 1) * base_prom_interval_ticks; +} + +static inline void decay_eb_yardstick(runqueue_t *rq) +{ + static const unsigned long long decay_per_interval = PROP_FM_PPT(990); + + rq->eb_yardstick = map_proportion(decay_per_interval, rq->eb_yardstick); + rq->eb_ticks_to_decay = time_slice_ticks; +} + +#define EB_PAR 19 +static inline void set_eb_yardstick(task_t *p) +{ + p->rq->eb_yardstick = p->cpu_usage_rate_per_share; + p->eb_priority = MAX_RT_PRIO + EB_PAR; + p->rq->eb_ticks_to_decay = time_slice_ticks; +} + +static inline int task_should_be_yardstick(const task_t *p) +{ + return (p->cpu_usage_rate_per_share > p->rq->eb_yardstick); +} + /* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. Note the ordering: we can safely lookup the task_rq without * explicitly disabling preemption. */ -static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) +static spinlock_t *task_rq_lock(const task_t *p, unsigned long *flags) { - struct runqueue *rq; + spinlock_t *rql; repeat_lock_task: local_irq_save(*flags); - rq = task_rq(p); - spin_lock(&rq->lock); - if (unlikely(rq != task_rq(p))) { - spin_unlock_irqrestore(&rq->lock, *flags); + rql = &p->rq->lock; + spin_lock(rql); + if (unlikely(rql != &p->rq->lock)) { + spin_unlock_irqrestore(rql, *flags); goto repeat_lock_task; } - return rq; + return rql; } -static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) +static inline void task_rq_unlock(spinlock_t *rql, unsigned long *flags) { - spin_unlock_irqrestore(&rq->lock, *flags); + spin_unlock_irqrestore(rql, *flags); } /* * rq_lock - lock a given runqueue and disable interrupts. */ -static runqueue_t *this_rq_lock(void) +static spinlock_t *this_rq_lock(void) { runqueue_t *rq; @@ -291,31 +554,67 @@ rq = this_rq(); spin_lock(&rq->lock); - return rq; + return &rq->lock; } -static inline void rq_unlock(runqueue_t *rq) +static int rr_interval(const task_t * p) { - spin_unlock_irq(&rq->lock); + int rr_interval = RR_INTERVAL(); + if (batch_task(p)) + rr_interval *= sched_batch_time_slice_multiplier; + else if (iso_task(p)) + rr_interval /= 2; + if (!rr_interval) + rr_interval = 1; + return rr_interval; +} + +static inline int task_preempts_curr(const struct task_struct *p) +{ + if (p->prio < get_rq_current_prio(p->rq)) { + if (rt_task(p) || !sched_compute || + p->rq->cache_ticks >= cache_decay_ticks || + (batch_task(p->rq->curr) && !batch_task(p))) + return 1; + p->rq->preempted = 1; + } + + return 0; +} + +static inline int task_queued(const task_t *task) +{ + return !list_empty(&task->run_list); } /* - * Adding/removing a task to/from a priority array: + * Adding/removing a task to/from a runqueue: */ -static void dequeue_task(struct task_struct *p, prio_array_t *array) +static void dequeue_task(struct task_struct *p) { - array->nr_active--; - list_del(&p->run_list); - if (list_empty(array->queue + p->prio)) - __clear_bit(p->prio, array->bitmap); + /* + * If p is the last task in this priority slot then slotp will be + * a pointer to the head of the list in the sunqueue structure + * NB we can't use p->prio for bitmap as task may have been + * promoted + */ + struct list_head *slotp = p->run_list.next; + + /* + * Initialize after removal from the list so that list_empty() works + * as a means for testing whether the task is runnable + */ + list_del_init(&p->run_list); + if (list_empty(slotp)) + __clear_bit(list_entry(slotp, struct prio_slot, queue)->prio, p->rq->bitmap); } -static void enqueue_task(struct task_struct *p, prio_array_t *array) +static void enqueue_task(struct task_struct *p) { - list_add_tail(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); - array->nr_active++; - p->array = array; + list_add_tail(&p->run_list, &p->rq->queues[p->prio].queue); + __set_bit(p->prio, p->rq->bitmap); + if (p == p->rq->curr) + set_rq_current_prio(p->rq, p->prio); } /* @@ -323,196 +622,470 @@ * remote queue so we want these tasks to show up at the head of the * local queue: */ -static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) +static inline void enqueue_task_head(struct task_struct *p) { - list_add(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); - array->nr_active++; - p->array = array; + list_add(&p->run_list, &p->rq->queues[p->prio].queue); + __set_bit(p->prio, p->rq->bitmap); + if (p == p->rq->curr) + set_rq_current_prio(p->rq, p->prio); } /* - * effective_prio - return the priority that is based on the static - * priority but is modified by bonuses/penalties. - * - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] - * into the -5 ... 0 ... +5 bonus/penalty range. - * - * We use 25% of the full 0...39 priority range so that: - * - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. - * - * Both properties are important to certain workloads. + * __activate_task - move a task to the runqueue. */ -static int effective_prio(task_t *p) +static inline void __activate_task(task_t *p) { - int bonus, prio; - - if (rt_task(p)) - return p->prio; - - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; - - prio = p->static_prio - bonus; - if (prio < MAX_RT_PRIO) - prio = MAX_RT_PRIO; - if (prio > MAX_PRIO-1) - prio = MAX_PRIO-1; - return prio; + enqueue_task(p); + p->rq->nr_running++; } /* - * __activate_task - move a task to the runqueue. + * activate task on the _front_ of runqueue. */ -static inline void __activate_task(task_t *p, runqueue_t *rq) +static inline void __activate_task_head(task_t *p) { - enqueue_task(p, rq->active); - rq->nr_running++; + enqueue_task_head(p); + p->rq->nr_running++; } /* - * __activate_idle_task - move idle task to the _front_ of runqueue. + * burst - extra intervals an interactive task can run for at best priority + * instead of descending priorities. */ -static inline void __activate_idle_task(task_t *p, runqueue_t *rq) +static unsigned int burst(const task_t *p) { - enqueue_task_head(p, rq->active); - rq->nr_running++; + if (likely(!rt_task(p))) { + unsigned int task_user_prio = TASK_USER_PRIO(p); + if (iso_task(p)) + task_user_prio /= 2; + return 39 - task_user_prio; + } else + return p->burst; } -static void recalc_task_prio(task_t *p, unsigned long long now) +static void inc_burst(task_t *p) { - unsigned long long __sleep_time = now - p->timestamp; - unsigned long sleep_time; + unsigned int best_burst; + best_burst = burst(p); + if (p->burst < best_burst) + p->burst++; +} - if (__sleep_time > NS_MAX_SLEEP_AVG) - sleep_time = NS_MAX_SLEEP_AVG; - else - sleep_time = (unsigned long)__sleep_time; +static void dec_burst(task_t *p) +{ + if (p->burst) + p->burst--; +} - if (likely(sleep_time > 0)) { - /* - * User tasks that sleep a long time are categorised as - * idle and will get just interactive status to stay active & - * prevent them suddenly becoming cpu hogs and starving - * other processes. - */ - if (p->mm && p->activated != -1 && - sleep_time > INTERACTIVE_SLEEP(p)) { - p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - - AVG_TIMESLICE); - if (!HIGH_CREDIT(p)) - p->interactive_credit++; - } else { - /* - * The lower the sleep avg a task has the more - * rapidly it will rise with sleep time. - */ - sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; +/* + * slice - the duration a task runs before getting requeued at it's best + * priority and has it's burst decremented. + */ +static unsigned int slice(const task_t *p) +{ + unsigned int slice = RR_INTERVAL(); + if (likely(!rt_task(p) && !batch_task(p))) + slice += burst(p) * RR_INTERVAL(); + else if (batch_task(p)) + slice *= 40 - TASK_USER_PRIO(p); + return slice; +} - /* - * Tasks with low interactive_credit are limited to - * one timeslice worth of sleep avg bonus. - */ - if (LOW_CREDIT(p) && - sleep_time > JIFFIES_TO_NS(task_timeslice(p))) - sleep_time = JIFFIES_TO_NS(task_timeslice(p)); +static int hog_sub_cycle_threshold = 10; - /* - * Non high_credit tasks waking from uninterruptible - * sleep are limited in their sleep_avg rise as they - * are likely to be cpu hogs waiting on I/O - */ - if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) { - if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) - sleep_time = 0; - else if (p->sleep_avg + sleep_time >= - INTERACTIVE_SLEEP(p)) { - p->sleep_avg = INTERACTIVE_SLEEP(p); - sleep_time = 0; - } - } +/* + * Calculate CPU usage rate and sleepiness. + * This never gets called on real time tasks + */ +static void calculate_rates(task_t *p) +{ + unsigned long long bl = p->avg_sleep_per_cycle + p->avg_cpu_per_cycle; + /* + * Take a shortcut and avoid possible divide by zero later + */ + if (unlikely(bl == 0)) { + p->sleepiness = PROPORTION_ONE; + p->cpu_usage_rate = 0; + p->cpu_usage_rate_per_share = 0; + } else { + p->sleepiness = calc_proportion(p->avg_sleep_per_cycle, bl); + if (unlikely(p->sub_cycle_count > hog_sub_cycle_threshold)) { /* - * This code gives a bonus to interactive tasks. - * - * The boost works by updating the 'average sleep time' - * value here, based on ->timestamp. The more time a - * task spends sleeping, the higher the average gets - - * and the higher the priority boost gets as well. + * Take a shortcut and avoid possible divide by zero later */ - p->sleep_avg += sleep_time; + if (p->avg_delay_per_sub_cycle == 0) + p->cpu_usage_rate = PROPORTION_ONE; + else { + unsigned long long sbl; - if (p->sleep_avg > NS_MAX_SLEEP_AVG) { - p->sleep_avg = NS_MAX_SLEEP_AVG; - if (!HIGH_CREDIT(p)) - p->interactive_credit++; + sbl = p->avg_delay_per_sub_cycle + p->avg_cpu_per_sub_cycle; + p->cpu_usage_rate = calc_proportion(p->avg_cpu_per_sub_cycle, sbl); } + } else { + bl += p->avg_delay_per_cycle; + p->cpu_usage_rate = calc_proportion(p->avg_cpu_per_cycle, bl); } + p->cpu_usage_rate_per_share = sched_div_64(p->cpu_usage_rate, p->eb_shares); } +} - p->prio = effective_prio(p); +/* + * Calculate entitlement based priority. + * This never gets called on real time tasks + */ +static void calculate_eb_priority(task_t *p) +{ + /* + * Prevent possible divide by zero and take shortcut + */ + if (unlikely(p->cpu_usage_rate_per_share == 0)) { + p->eb_priority = MAX_RT_PRIO; + } else if (unlikely(p->cpu_usage_rate_per_share > p->rq->eb_yardstick)) { + unsigned long long prop = calc_proportion(p->rq->eb_yardstick, p->cpu_usage_rate_per_share); + + p->eb_priority = MAX_PRIO - map_proportion_rnd(prop, EB_PAR + 1); + } else { + unsigned long long prop = calc_proportion(p->cpu_usage_rate_per_share, p->rq->eb_yardstick); + + p->eb_priority = MAX_RT_PRIO + map_proportion_rnd(prop, EB_PAR); + } } /* - * activate_task - move a task to the runqueue and do priority recalculation - * - * Update all the scheduling statistics stuff. (sleep average - * calculation, priority modifiers, etc.) + * Initialize the scheduling statistics counters */ -static void activate_task(task_t *p, runqueue_t *rq, int local) +static inline void initialize_stats(task_t *p) { - unsigned long long now; + p->avg_sleep_per_cycle = 0; + p->avg_delay_per_cycle = 0; + p->avg_delay_per_sub_cycle = 0; + p->avg_cpu_per_cycle = 0; + p->avg_cpu_per_sub_cycle = 0; + p->total_sleep = 0; + p->total_delay = 0; + p->total_cpu = 0; + p->cycle_count = 0; + p->intr_wake_ups = 0; + p->sched_timestamp = sched_clock(); +} - now = sched_clock(); -#ifdef CONFIG_SMP - if (!local) { - /* Compensate for drifting sched_clock */ - runqueue_t *this_rq = this_rq(); - now = (now - this_rq->timestamp_last_tick) - + rq->timestamp_last_tick; +static inline void delta_sleep_stats(task_t *p, unsigned long long now) +{ + unsigned long long delta = now - p->sched_timestamp; + + p->sched_timestamp = now; + p->avg_sleep_per_cycle += delta; + p->total_sleep += delta; +} + +static inline void delta_cpu_stats(task_t *p, unsigned long long now) +{ + unsigned long long delta = now - p->sched_timestamp; + + p->sched_timestamp = now; + p->avg_cpu_per_cycle += delta; + p->avg_cpu_per_sub_cycle += delta; + p->total_cpu += delta; +} + +static inline void delta_delay_stats(task_t *p, unsigned long long now) +{ + unsigned long long delta = now - p->sched_timestamp; + + p->sched_timestamp = now; + p->avg_delay_per_cycle += delta; + p->avg_delay_per_sub_cycle += delta; + p->total_delay += delta; + p->rq->total_delay += delta; +} + +/* + * Update various statistics for the end of a + * ((on_run_queue :-> on_cpu)* :-> sleep) cycle. + * We can't just do this in activate_task() as every invocation of that + * function is not the genuine end of a cycle. + */ +static void update_stats_for_cycle(task_t *p) +{ + unsigned long long now = adjusted_sched_clock(p); + + delta_sleep_stats(p, now); + if (in_interrupt()) + p->intr_wake_ups++; + p->cycle_count++; + p->sub_cycle_count = 0; + if (!rt_task(p)) { + /* we con't care about these for real time tasks + * + * Do this after delta_sleep so that averages for all measures + * are for the current cycle + */ + apply_sched_avg_decay(&p->avg_sleep_per_cycle); + apply_sched_avg_decay(&p->avg_delay_per_cycle); + apply_sched_avg_decay(&p->avg_cpu_per_cycle); + apply_sched_avg_decay(&p->avg_delay_per_sub_cycle); + apply_sched_avg_decay(&p->avg_cpu_per_sub_cycle); + if (sched_mode != SCHED_MODE_STAIRCASE) + calculate_rates(p); } -#endif +} - recalc_task_prio(p, now); +/* + * Check whether a task with an interactive bonus still qualifies and if not + * decrease its bonus + * This never gets called on real time tasks + */ +static void reassess_cpu_boundness(task_t *p) +{ + /* + * No point going any further if there's no bonus to lose + */ + if (p->interactive_bonus == 0) + return; + + if (p->cpu_usage_rate > cpu_hog_threshold) + update_sched_ia_bonus(p, 0); +} +/* + * Check whether a task qualifies for an interactive bonus and if it does + * increase its bonus + * This never gets called on real time tasks + */ +static void reassess_interactiveness(task_t *p) +{ /* - * This checks to make sure it's not an uninterruptible task - * that is now waking up. + * No sleep means not interactive (in most cases), but */ - if (!p->activated) { + if (p->avg_sleep_per_cycle > LOWER_MAX_IA_SLEEP) { /* - * Tasks which were woken up by interrupts (ie. hw events) - * are most likely of interactive nature. So we give them - * the credit of extending their sleep time to the period - * of time they spend on the runqueue, waiting for execution - * on a CPU, first time around: + * Really long sleeps mean it's probably not interactive */ - if (in_interrupt()) - p->activated = 2; - else { + if (p->avg_sleep_per_cycle > UPPER_MAX_IA_SLEEP) + update_sched_ia_bonus(p, 0); + return; + } + if (p->sleepiness > ia_threshold) + update_sched_ia_bonus(p, p->sleepiness); + else if (p->sub_cycle_count == 0) + reassess_cpu_boundness(p); +} + +/* + * Check whether a task qualifies for a throughput bonus and if it does + * give it one + * This never gets called on real time tasks + */ +static inline unsigned long long tpt_bonus_proportion(unsigned long long delay, + unsigned long long cpu, unsigned long load) +{ + unsigned long long expected_delay; + + if (load <= (1 << SCHED_AVG_OFFSET)) + expected_delay = 0; + else + expected_delay = SCHED_AVG_MUL(cpu, (load - (1 << SCHED_AVG_OFFSET))); + + /* + * No unexpected delay means no bonus, but + * NB this test also avoids a possible divide by zero error if + * cpu is also zero and negative bonuses + */ + if (delay <= expected_delay) + return 0; + + delay -= expected_delay; + + return calc_proportion(delay, delay + cpu); +} + +static void recalc_throughput_bonus(task_t *p) +{ + unsigned long long ratio; + + if (unlikely(p->sub_cycle_count > hog_sub_cycle_threshold)) + ratio = tpt_bonus_proportion(p->avg_delay_per_sub_cycle, + p->avg_cpu_per_sub_cycle, p->rq->avg_nr_running); + else + ratio = tpt_bonus_proportion(p->avg_delay_per_cycle, + p->avg_cpu_per_cycle, p->rq->avg_nr_running); + + p->throughput_bonus = proportion_sqrt(ratio); +} + +/* + * sched_interactive - sysctl which allows interactive tasks to have bursts + */ +int sched_interactive = 0; + +/* + * "pb" + * effective_prio - return the priority that is based on the static + * priority but is modified by bonuses/penalties. + * "sc" + * effective_prio - dynamic priority dependent on burst. + * The priority normally decreases by one each RR_INTERVAL. + * As the burst increases the priority stays at the top "stair" or + * priority for longer. + */ +static int effective_prio(task_t *p) +{ + int prio, rr; + unsigned int full_slice, used_slice, first_slice; + unsigned int best_burst; + unsigned int miabl, mtpbl, bonus_factor; + + if (rt_task(p)) + return p->prio; + + switch (sched_mode) { + case SCHED_MODE_STAIRCASE: + goto staircase_prio; + case SCHED_MODE_ENTITLEMENT_BASED: + prio = p->eb_priority; + break; + default: + prio = p->static_prio; + } + + if (unlikely(batch_task(p) && !(p->flags & PF_UISLEEP))) + return BATCH_PRIO; + + /* + * kernel threads get maximum bonuses + */ + if (p->mm == NULL) + return prio; + + miabl = max_ia_bonus; + mtpbl = max_tpt_bonus; + bonus_factor = (miabl + mtpbl); + bonus_factor -= map_proportion_rnd(SCHED_IA_BONUS_RND(p->interactive_bonus), miabl); + bonus_factor -= map_proportion_rnd(p->throughput_bonus, mtpbl); + + if (iso_task(p) && likely(p->cpu_usage_rate < sched_iso_threshold)) + return MAX_RT_PRIO + bonus_factor; + + return prio + bonus_factor; + +staircase_prio: + if (batch_task(p)) { + if (unlikely(p->flags & PF_UISLEEP)) { /* - * Normal first-time wakeups get a credit too for - * on-runqueue time, but it will be weighted down: + * If batch is waking up from uninterruptible sleep + * reschedule at a normal priority to begin with. */ - p->activated = 1; + p->flags |= PF_YIELDED; + return MAX_PRIO - 2; + } + return BATCH_PRIO; + } + + best_burst = burst(p); + full_slice = slice(p); + rr = rr_interval(p); + used_slice = full_slice - p->slice; + if (p->burst > best_burst) + p->burst = best_burst; + first_slice = rr; + if (sched_interactive && !sched_compute && !iso_task(p)) + first_slice *= (p->burst + 1); + prio = MAX_PRIO - 2 - best_burst; + + if (used_slice < first_slice) + return prio; + prio += 1 + (used_slice - first_slice) / rr; + if (prio > MAX_PRIO - 2) + prio = MAX_PRIO - 2; + return prio; +} + +/* + * recalc_task_prio - this checks for tasks that run ultra short timeslices + * or have just forked a thread/process and make them continue their old + * slice instead of starting a new one at high priority. + */ +static void recalc_task_prio(task_t *p, unsigned long long now) +{ + unsigned long sleep_time; + unsigned long ns_totalrun; + unsigned long total_run; + + if (sched_mode == SCHED_MODE_STAIRCASE) + goto recalc_staircase; + + /* + * Throughput bonus is dependent on how busy the CPU is so do it here to + * catch any CPU changes + */ + if (!rt_task(p)) { + recalc_throughput_bonus(p); + if (sched_mode == SCHED_MODE_ENTITLEMENT_BASED) { + if (task_should_be_yardstick(p)) + set_eb_yardstick(p); + calculate_eb_priority(p); } } + p->prio = effective_prio(p); + return; + +recalc_staircase: + sleep_time = now - p->timestamp; + ns_totalrun = p->totalrun + p->runtime; + total_run = NS_TO_JIFFIES(ns_totalrun); + p->slice = slice(p); + if (p->flags & PF_FORKED || ((!(NS_TO_JIFFIES(p->runtime)) || + !sched_interactive || sched_compute) && + NS_TO_JIFFIES(p->runtime + sleep_time) < rr_interval(p))) { + p->flags &= ~PF_FORKED; + if (p->slice - total_run < 1) { + p->totalrun = 0; + dec_burst(p); + } else { + p->totalrun = ns_totalrun; + p->slice -= total_run; + } + } else { + if (!(p->flags & PF_UISLEEP)) + inc_burst(p); + p->runtime = 0; + p->totalrun = 0; + } + p->prio = effective_prio(p); + p->time_slice = rr_interval(p); + if (batch_task(p)) + p->time_slice = p->slice; +} + +/* + * activate_task - move a task to the runqueue and do priority recalculation + */ +static void activate_task(task_t *p) +{ + /* Compensate for drifting sched_clock */ + unsigned long long now = adjusted_sched_clock(p); + + recalc_task_prio(p, now); p->timestamp = now; + if ((sched_mode != SCHED_MODE_STAIRCASE) || unlikely(rt_task(p))) + p->time_slice = task_timeslice(p); + p->flags &= ~PF_UISLEEP; - __activate_task(p, rq); + __activate_task(p); } /* * deactivate_task - remove a task from the runqueue. */ -static void deactivate_task(struct task_struct *p, runqueue_t *rq) +static void deactivate_task(struct task_struct *p) { - rq->nr_running--; - if (p->state == TASK_UNINTERRUPTIBLE) - rq->nr_uninterruptible++; - dequeue_task(p, p->array); - p->array = NULL; + p->rq->nr_running--; + if (p->state == TASK_UNINTERRUPTIBLE) { + p->flags |= PF_UISLEEP; + p->rq->nr_uninterruptible++; + } + dequeue_task(p); } /* @@ -544,13 +1117,19 @@ } #endif +static inline void preempt_curr_if_warranted(struct task_struct *p) +{ + if (task_preempts_curr(p)) + resched_task(p->rq->curr); +} + /** * task_curr - is this task currently executing on a CPU? * @p: the task in question. */ inline int task_curr(const task_t *p) { - return cpu_curr(task_cpu(p)) == p; + return p->rq->curr == p; } #ifdef CONFIG_SMP @@ -579,13 +1158,11 @@ */ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) { - runqueue_t *rq = task_rq(p); - /* * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. */ - if (!p->array && !task_running(rq, p)) { + if (!task_queued(p) && !task_is_running(p)) { set_task_cpu(p, dest_cpu); return 0; } @@ -594,7 +1171,7 @@ req->type = REQ_MOVE_TASK; req->task = p; req->dest_cpu = dest_cpu; - list_add(&req->list, &rq->migration_queue); + list_add(&req->list, &p->rq->migration_queue); return 1; } @@ -610,22 +1187,22 @@ void wait_task_inactive(task_t * p) { unsigned long flags; - runqueue_t *rq; + spinlock_t *rql; int preempted; repeat: - rq = task_rq_lock(p, &flags); + rql = task_rq_lock(p, &flags); /* Must be off runqueue entirely, not preempted. */ - if (unlikely(p->array)) { + if (unlikely(task_queued(p))) { /* If it's preempted, we yield. It could be a while. */ - preempted = !task_running(rq, p); - task_rq_unlock(rq, &flags); + preempted = !task_is_running(p); + task_rq_unlock(rql, &flags); cpu_relax(); if (preempted) yield(); goto repeat; } - task_rq_unlock(rq, &flags); + task_rq_unlock(rql, &flags); } /*** @@ -733,26 +1310,33 @@ int cpu, this_cpu, success = 0; unsigned long flags; long old_state; - runqueue_t *rq; + spinlock_t *rql; + runqueue_t *old_rq; #ifdef CONFIG_SMP unsigned long load, this_load; struct sched_domain *sd; int new_cpu; #endif - rq = task_rq_lock(p, &flags); + rql = task_rq_lock(p, &flags); + old_rq = p->rq; old_state = p->state; if (!(old_state & state)) goto out; - if (p->array) + if (task_queued(p)) goto out_running; + /* + * This is the end of one scheduling cycle and the start of the next + */ + update_stats_for_cycle(p); + cpu = task_cpu(p); this_cpu = smp_processor_id(); #ifdef CONFIG_SMP - if (unlikely(task_running(rq, p))) + if (unlikely(task_is_running(p))) goto out_activate; new_cpu = cpu; @@ -789,7 +1373,7 @@ imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2; if ( ((sd->flags & SD_WAKE_AFFINE) && - !task_hot(p, rq->timestamp_last_tick, sd)) + !task_hot(p, sd)) || ((sd->flags & SD_WAKE_BALANCE) && imbalance*this_load <= 100*load) ) { /* @@ -806,13 +1390,14 @@ new_cpu = wake_idle(new_cpu, p); if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) { set_task_cpu(p, new_cpu); - task_rq_unlock(rq, &flags); + task_rq_unlock(rql, &flags); /* might preempt at this point */ - rq = task_rq_lock(p, &flags); + rql = task_rq_lock(p, &flags); + adjust_sched_timestamp(p, old_rq); old_state = p->state; if (!(old_state & state)) goto out; - if (p->array) + if (task_queued(p)) goto out_running; this_cpu = smp_processor_id(); @@ -821,16 +1406,17 @@ out_activate: #endif /* CONFIG_SMP */ - if (old_state == TASK_UNINTERRUPTIBLE) { - rq->nr_uninterruptible--; - /* - * Tasks on involuntary sleep don't earn - * sleep_avg beyond just interactive state. - */ - p->activated = -1; - } + if (old_state == TASK_UNINTERRUPTIBLE) + old_rq->nr_uninterruptible--; /* + * Do this here rather than in activate_task() because activate() gets + * called at times when thes calculations are unnecessary e.g. for a + * change of CPU + */ + if (!rt_task(p) && (sched_mode != SCHED_MODE_STAIRCASE)) + reassess_interactiveness(p); + /* * Sync wakeups (i.e. those types of wakeups where the waker * has indicated that it will leave the CPU in short order) * don't trigger a preemption, if the woken up task will run on @@ -838,17 +1424,15 @@ * the waker guarantees that the freshly woken up task is going * to be considered on this CPU.) */ - activate_task(p, rq, cpu == this_cpu); - if (!sync || cpu != this_cpu) { - if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); - } + activate_task(p); + if (!sync || cpu != this_cpu) + preempt_curr_if_warranted(p); success = 1; out_running: p->state = TASK_RUNNING; out: - task_rq_unlock(rq, &flags); + task_rq_unlock(rql, &flags); return success; } @@ -867,11 +1451,22 @@ } #ifdef CONFIG_SMP -static int find_idlest_cpu(struct task_struct *p, int this_cpu, +static int find_idlest_cpu(const struct task_struct *p, int this_cpu, struct sched_domain *sd); #endif /* + * Initialize the scheduling bonuses + */ +static inline void initialize_bonuses(task_t *p) +{ + p->interactive_bonus = (max_ia_bonus >= initial_ia_bonus) ? + initial_ia_bonus : max_ia_bonus; + p->throughput_bonus = 0; + p->sub_cycle_count = 0; +} + +/* * Perform scheduler related setup for a newly forked process p. * p is forked by current. */ @@ -885,7 +1480,6 @@ */ p->state = TASK_RUNNING; INIT_LIST_HEAD(&p->run_list); - p->array = NULL; spin_lock_init(&p->switch_lock); #ifdef CONFIG_PREEMPT /* @@ -897,32 +1491,16 @@ p->thread_info->preempt_count = 1; #endif /* - * Share the timeslice between parent and child, thus the - * total amount of pending timeslices in the system doesn't change, - * resulting in more scheduling fairness. + * Give the child a new timeslice */ - local_irq_disable(); - p->time_slice = (current->time_slice + 1) >> 1; + if ((sched_mode != SCHED_MODE_STAIRCASE) || unlikely(rt_task(p))) + p->time_slice = task_timeslice(p); + p->timestamp = sched_clock(); /* - * The remainder of the first timeslice might be recovered by - * the parent if the child exits early enough. + * Initialize the scheduling statistics and bonus counters */ - p->first_time_slice = 1; - current->time_slice >>= 1; - p->timestamp = sched_clock(); - if (unlikely(!current->time_slice)) { - /* - * This case is rare, it happens when the parent has only - * a single jiffy left from its timeslice. Taking the - * runqueue lock is not a problem. - */ - current->time_slice = 1; - preempt_disable(); - scheduler_tick(0, 0); - local_irq_enable(); - preempt_enable(); - } else - local_irq_enable(); + initialize_stats(p); + initialize_bonuses(p); } /* @@ -936,26 +1514,17 @@ { unsigned long flags; int this_cpu, cpu; - runqueue_t *rq; + spinlock_t *rql; - rq = task_rq_lock(p, &flags); + rql = task_rq_lock(p, &flags); cpu = task_cpu(p); this_cpu = smp_processor_id(); - BUG_ON(p->state != TASK_RUNNING); - - /* - * We decrease the sleep average of forking parents - * and children as well, to keep max-interactive tasks - * from forking tasks that are max-interactive. The parent - * (current) is done further down, under its lock. + /* + * Forked process gets no burst to prevent fork bombs. */ - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - - p->interactive_credit = 0; - - p->prio = effective_prio(p); + p->burst = 0; + BUG_ON(p->state != TASK_RUNNING); if (likely(cpu == this_cpu)) { if (!(clone_flags & CLONE_VM)) { @@ -963,75 +1532,64 @@ * The VM isn't cloned, so we're in a good position to * do child-runs-first in anticipation of an exec. This * usually avoids a lot of COW overhead. + * + * Now that the idle task is back on the run queue + * we need extra care to make sure that its one and + * only fork() doesn't end up in the idle priority slot. + * Just testing for empty run list is no longer adequate. */ - if (unlikely(!current->array)) - __activate_task(p, rq); - else { - p->prio = current->prio; + if (unlikely(!task_queued(current) || RUNQUEUE_IDLE(current->rq))) { + p->prio = effective_prio(p); + __activate_task(p); + } else { + /* + * Put the child on the same list(s) as (but + * ahead of) the parent. We can't use + * current->prio as current may have been promoted + */ + p->prio = get_rq_current_prio(current->rq); list_add_tail(&p->run_list, ¤t->run_list); - p->array = current->array; - p->array->nr_active++; - rq->nr_running++; + current->rq->nr_running++; } set_need_resched(); - } else + } else { /* Run child last */ - __activate_task(p, rq); + p->prio = effective_prio(p); + __activate_task(p); + } } else { runqueue_t *this_rq = cpu_rq(this_cpu); + p->prio = effective_prio(p); /* * Not the local CPU - must adjust timestamp. This should * get optimised away in the !CONFIG_SMP case. */ - p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) - + rq->timestamp_last_tick; - __activate_task(p, rq); - if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); - - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - } - - if (unlikely(cpu != this_cpu)) { - task_rq_unlock(rq, &flags); - rq = task_rq_lock(current, &flags); + adjust_timestamp(p, this_rq); + __activate_task(p); + preempt_curr_if_warranted(p); } - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - task_rq_unlock(rq, &flags); + current->flags |= PF_FORKED; + task_rq_unlock(rql, &flags); } -/* - * Potentially available exiting-child timeslices are - * retrieved here - this way the parent does not get - * penalized for creating too many threads. - * - * (this cannot be used to 'generate' timeslices - * artificially, because any timeslice recovered here - * was given away by the parent in the first place.) +/** + * (Optionally) log scheduler statistics at exit. */ +static int log_at_exit = 0; void fastcall sched_exit(task_t * p) { - unsigned long flags; - runqueue_t *rq; + struct task_sched_stats stats; - /* - * If the child was a (relative-) CPU hog then decrease - * the sleep_avg of the parent as well. - */ - rq = task_rq_lock(p->parent, &flags); - if (p->first_time_slice) { - p->parent->time_slice += p->time_slice; - if (unlikely(p->parent->time_slice > MAX_TIMESLICE)) - p->parent->time_slice = MAX_TIMESLICE; - } - if (p->sleep_avg < p->parent->sleep_avg) - p->parent->sleep_avg = p->parent->sleep_avg / - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / - (EXIT_WEIGHT + 1); - task_rq_unlock(rq, &flags); + if (!log_at_exit) + return; + + get_task_sched_stats(p, &stats); + printk("SCHED_EXIT[%d] (%s) %llu %llu %llu %llu %llu %lu %lu %lu %lu\n", + p->pid, p->comm, + stats.total_sleep, stats.total_cpu, stats.total_delay, + stats.cycle_count, stats.intr_wake_ups, + p->nvcsw, p->nivcsw, p->cnvcsw, p->cnivcsw); } /** @@ -1223,7 +1781,7 @@ /* * find_idlest_cpu - find the least busy runqueue. */ -static int find_idlest_cpu(struct task_struct *p, int this_cpu, +static int find_idlest_cpu(const struct task_struct *p, int this_cpu, struct sched_domain *sd) { unsigned long load, min_load, this_load; @@ -1275,10 +1833,10 @@ static void sched_migrate_task(task_t *p, int dest_cpu) { migration_req_t req; - runqueue_t *rq; + spinlock_t *rql; unsigned long flags; - rq = task_rq_lock(p, &flags); + rql = task_rq_lock(p, &flags); if (!cpu_isset(dest_cpu, p->cpus_allowed) || unlikely(cpu_is_offline(dest_cpu))) goto out; @@ -1286,16 +1844,16 @@ /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref). */ - struct task_struct *mt = rq->migration_thread; + struct task_struct *mt = p->rq->migration_thread; get_task_struct(mt); - task_rq_unlock(rq, &flags); + task_rq_unlock(rql, &flags); wake_up_process(mt); put_task_struct(mt); wait_for_completion(&req.done); return; } out: - task_rq_unlock(rq, &flags); + task_rq_unlock(rql, &flags); } /* @@ -1335,29 +1893,26 @@ * Both runqueues must be locked. */ static inline -void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, - runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) +void pull_task(task_t *p, int this_cpu) { - dequeue_task(p, src_array); + runqueue_t *src_rq = p->rq; + + dequeue_task(p); src_rq->nr_running--; + delta_delay_stats(p, adjusted_sched_clock(p)); set_task_cpu(p, this_cpu); - this_rq->nr_running++; - enqueue_task(p, this_array); - p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) - + this_rq->timestamp_last_tick; - /* - * Note that idle threads have a prio of MAX_PRIO, for this test - * to be always true for them. - */ - if (TASK_PREEMPTS_CURR(p, this_rq)) - resched_task(this_rq->curr); + p->rq->nr_running++; + enqueue_task(p); + adjust_timestamp(p, src_rq); + adjust_sched_timestamp(p, src_rq); + preempt_curr_if_warranted(p); } /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static inline -int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, +int can_migrate_task(const task_t *p, int this_cpu, struct sched_domain *sd, enum idle_type idle) { /* @@ -1366,7 +1921,7 @@ * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. */ - if (task_running(rq, p)) + if (task_is_running(p)) return 0; if (!cpu_isset(this_cpu, p->cpus_allowed)) return 0; @@ -1374,7 +1929,7 @@ /* Aggressive migration if we've failed balancing */ if (idle == NEWLY_IDLE || sd->nr_balance_failed < sd->cache_nice_tries) { - if (task_hot(p, rq->timestamp_last_tick, sd)) + if (task_hot(p, sd)) return 0; } @@ -1392,7 +1947,6 @@ unsigned long max_nr_move, struct sched_domain *sd, enum idle_type idle) { - prio_array_t *array, *dst_array; struct list_head *head, *curr; int idx, pulled = 0; task_t *tmp; @@ -1400,51 +1954,30 @@ if (max_nr_move <= 0 || busiest->nr_running <= 1) goto out; - /* - * We first consider expired tasks. Those will likely not be - * executed in the near future, and they are most likely to - * be cache-cold, thus switching CPUs has the least effect - * on them. - */ - if (busiest->expired->nr_active) { - array = busiest->expired; - dst_array = this_rq->expired; - } else { - array = busiest->active; - dst_array = this_rq->active; - } - -new_array: /* Start searching at priority 0: */ idx = 0; skip_bitmap: if (!idx) - idx = sched_find_first_bit(array->bitmap); + idx = sched_find_first_bit(busiest->bitmap); else - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); - if (idx >= MAX_PRIO) { - if (array == busiest->expired && busiest->active->nr_active) { - array = busiest->active; - dst_array = this_rq->active; - goto new_array; - } + idx = find_next_bit(busiest->bitmap, IDLE_PRIO, idx); + if (idx >= IDLE_PRIO) goto out; - } - head = array->queue + idx; + head = &busiest->queues[idx].queue; curr = head->prev; skip_queue: tmp = list_entry(curr, task_t, run_list); curr = curr->prev; - if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { + if (!can_migrate_task(tmp, this_cpu, sd, idle)) { if (curr != head) goto skip_queue; idx++; goto skip_bitmap; } - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + pull_task(tmp, this_cpu); pulled++; /* We only want to steal up to the prescribed number of tasks. */ @@ -1603,7 +2136,7 @@ /* * find_busiest_queue - find the busiest runqueue among the cpus in group. */ -static runqueue_t *find_busiest_queue(struct sched_group *group) +static runqueue_t *find_busiest_queue(const struct sched_group *group) { cpumask_t tmp; unsigned long load, max_load = 0; @@ -1882,6 +2415,11 @@ } } } + +static inline int needs_idle_balance(const runqueue_t *rq) +{ + return rq->nr_running == 0; +} #else /* * on UP we do not need to balance between CPUs: @@ -1892,6 +2430,10 @@ static inline void idle_balance(int cpu, runqueue_t *rq) { } +static inline int needs_idle_balance(const runqueue_t *rq) +{ + return 0; +} #endif static inline int wake_priority_sleeper(runqueue_t *rq) @@ -1909,27 +2451,60 @@ return 0; } +/* + * Are promotions due? + */ +static inline int promotions_due(const runqueue_t *rq) +{ + return time_after_eq(jiffies, rq->next_prom_due); +} + +/* + * Assume runqueue lock is NOT already held. + * This is not executed when current task is SCHED_FIFO + */ +static void do_promotions(runqueue_t *rq) +{ + int idx = MAX_RT_PRIO; + + spin_lock(&rq->lock); + /* + * If there's less than 2 tasks running defer the next promotion + */ + if (rq->nr_running < 2) + goto defer_promotion; + for (;;) { + int new_prio; + idx = find_next_bit(rq->bitmap, IDLE_PRIO, idx + 1); + /* don't promote SCHED_BATCH tasks */ + if (idx > (BATCH_PRIO - 1)) + break; + + new_prio = idx - 1; + __list_splice(&rq->queues[idx].queue, rq->queues[new_prio].queue.prev); + INIT_LIST_HEAD(&rq->queues[idx].queue); + __clear_bit(idx, rq->bitmap); + __set_bit(new_prio, rq->bitmap); + /* + * If promotion occurs from the slot + * associated with the current task then the + * current task will be one of those promoted + * so we should update rq' current priority. + * This will only be true for at most one slot. + */ + if (unlikely(idx == get_rq_current_prio(rq))) + set_rq_current_prio(rq, new_prio); + } +defer_promotion: + rq->next_prom_due = (jiffies + get_prom_interval(rq)); + spin_unlock(&rq->lock); +} + DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); /* - * We place interactive tasks back into the active array, if possible. - * - * To guarantee that this does not starve expired tasks we ignore the - * interactivity of a task if the first expired task had to wait more - * than a 'reasonable' amount of time. This deadline timeout is - * load-dependent, as the frequency of array switched decreases with - * increasing number of running tasks. We also ignore the interactivity - * if a better static_prio task has expired: - */ -#define EXPIRED_STARVING(rq) \ - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ - (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ - ((rq)->curr->static_prio > (rq)->best_expired_prio)) - -/* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. * @@ -1940,10 +2515,11 @@ { int cpu = smp_processor_id(); struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - runqueue_t *rq = this_rq(); task_t *p = current; + unsigned long decayed_avg_nr_running; + unsigned long long now; - rq->timestamp_last_tick = sched_clock(); + now = p->rq->timestamp_last_tick = sched_clock(); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_ticks); @@ -1957,98 +2533,122 @@ sys_ticks = 0; } - if (p == rq->idle) { - if (atomic_read(&rq->nr_iowait) > 0) + /* + * While it's only being used for throughput bonus calculation races + * reading rq->avg_nr_running will be harmless so don't bother locking + */ + decayed_avg_nr_running = SCHED_AVG_MUL(p->rq->avg_nr_running, SCHED_AVG_ALPHA); + if (is_idle_task(p)) { + p->rq->avg_nr_running = decayed_avg_nr_running; + if (sched_mode == SCHED_MODE_ENTITLEMENT_BASED) { + spin_lock(&p->rq->lock); + if (!--p->rq->eb_ticks_to_decay) + decay_eb_yardstick(p->rq); + spin_unlock(&p->rq->lock); + } + if (atomic_read(&p->rq->nr_iowait) > 0) cpustat->iowait += sys_ticks; else cpustat->idle += sys_ticks; - if (wake_priority_sleeper(rq)) + if (wake_priority_sleeper(p->rq)) goto out; - rebalance_tick(cpu, rq, IDLE); + rebalance_tick(cpu, p->rq, IDLE); return; } - if (TASK_NICE(p) > 0) + p->rq->avg_nr_running = decayed_avg_nr_running + p->rq->nr_running; + if (TASK_NICE(p) > 0 || batch_task(p)) cpustat->nice += user_ticks; else cpustat->user += user_ticks; cpustat->system += sys_ticks; - /* Task might have expired already, but not scheduled off yet */ - if (p->array != rq->active) { - set_tsk_need_resched(p); + /* + * SCHED_FIFO tasks never run out of timeslice. + */ + if (unlikely(p->policy == SCHED_FIFO)) goto out; + + spin_lock(&p->rq->lock); + p->rq->cache_ticks++; + if (sched_mode == SCHED_MODE_STAIRCASE) + goto sched_staircase; + if ((sched_mode == SCHED_MODE_ENTITLEMENT_BASED) && (!--p->rq->eb_ticks_to_decay)) { + decay_eb_yardstick(p->rq); + if (likely(!rt_task(p)) && task_should_be_yardstick(p)) + set_eb_yardstick(p); } - spin_lock(&rq->lock); + if (!--p->time_slice) { + dequeue_task(p); + set_tsk_need_resched(p); + if (likely(p->policy != SCHED_RR)) { + apply_sched_avg_decay(&p->avg_delay_per_sub_cycle); + apply_sched_avg_decay(&p->avg_cpu_per_sub_cycle); + delta_cpu_stats(p, now); + p->sub_cycle_count++; + calculate_rates(p); + recalc_throughput_bonus(p); + reassess_cpu_boundness(p); + /* + * Arguably the interactive bonus should be updated here + * as well. But depends on whether we wish to encourage + * interactive tasks to maintain a high bonus or CPU bound + * tasks to lose some of there bonus? + */ + if (sched_mode == SCHED_MODE_ENTITLEMENT_BASED) { + if (task_should_be_yardstick(p)) + set_eb_yardstick(p); + calculate_eb_priority(p); + } + p->prio = effective_prio(p); + } + p->time_slice = task_timeslice(p); + enqueue_task(p); + goto out_unlock; + } + goto check_preempt; +sched_staircase: /* - * The task was running during this tick - update the - * time slice counter. Note: we do not update a thread's - * priority until it either goes to sleep or uses up its - * timeslice. This makes it possible for interactive tasks - * to use up their timeslices at their highest priority levels. + * Tasks lose burst each time they use up a full slice(). */ - if (rt_task(p)) { - /* - * RR tasks need a special form of timeslice management. - * FIFO tasks have no timeslices. - */ - if ((p->policy == SCHED_RR) && !--p->time_slice) { - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - set_tsk_need_resched(p); - - /* put it at the end of the queue: */ - dequeue_task(p, rq->active); - enqueue_task(p, rq->active); + if (!--p->slice) { + set_tsk_need_resched(p); + dequeue_task(p); + if (unlikely(p->policy == SCHED_RR)) + p->slice = p->time_slice = sched_rr_time_slice_ticks; + else { + dec_burst(p); + p->slice = slice(p); + p->prio = effective_prio(p); + p->time_slice = rr_interval(p); } + enqueue_task(p); goto out_unlock; } + /* + * Tasks that run out of time_slice but still have slice left get + * requeued with a lower priority && rr_interval time_slice. + */ if (!--p->time_slice) { - dequeue_task(p, rq->active); + dequeue_task(p); set_tsk_need_resched(p); - p->prio = effective_prio(p); - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - - if (!rq->expired_timestamp) - rq->expired_timestamp = jiffies; - if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { - enqueue_task(p, rq->expired); - if (p->static_prio < rq->best_expired_prio) - rq->best_expired_prio = p->static_prio; - } else - enqueue_task(p, rq->active); - } else { - /* - * Prevent a too long timeslice allowing a task to monopolize - * the CPU. We do this by splitting up the timeslice into - * smaller pieces. - * - * Note: this does not mean the task's timeslices expire or - * get lost in any way, they just might be preempted by - * another task of equal priority. (one with higher - * priority would have preempted this task already.) We - * requeue this task to the end of the list on this priority - * level, which is in essence a round-robin of tasks with - * equal priority. - * - * This only applies to tasks in the interactive - * delta range with at least TIMESLICE_GRANULARITY to requeue. - */ - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - - p->time_slice) % TIMESLICE_GRANULARITY(p)) && - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && - (p->array == rq->active)) { - - dequeue_task(p, rq->active); - set_tsk_need_resched(p); + if (unlikely(p->policy == SCHED_RR)) + p->slice = p->time_slice = sched_rr_time_slice_ticks; + else { p->prio = effective_prio(p); - enqueue_task(p, rq->active); + p->time_slice = rr_interval(p); } + enqueue_task(p); + goto out_unlock; } +check_preempt: + if (p->rq->preempted && p->rq->cache_ticks >= cache_decay_ticks) + set_tsk_need_resched(p); out_unlock: - spin_unlock(&rq->lock); + spin_unlock(&p->rq->lock); out: - rebalance_tick(cpu, rq, NOT_IDLE); + rebalance_tick(cpu, p->rq, NOT_IDLE); + if (unlikely(promotions_due(p->rq))) + do_promotions(p->rq); } #ifdef CONFIG_SCHED_SMT @@ -2079,9 +2679,25 @@ } } -static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p) +static inline int proportionally_more_timeslice(const task_t *p1, + const task_t * p2, const struct sched_domain *sd) { - struct sched_domain *sd = rq->sd; + unsigned int ts1, ts2; + + if (sched_mode == SCHED_MODE_STAIRCASE) { + ts1 = p1->slice; + ts2 = slice(p2); + } else { + ts1 = p1->time_slice; + ts2 = task_timeslice(p2); + } + + return (ts1 * (100 - sd->per_cpu_gain) / 100) > ts2; +} + +static inline int dependent_sleeper(int cpu, task_t *p) +{ + struct sched_domain *sd = p->rq->sd; cpumask_t sibling_map; int ret = 0, i; @@ -2107,9 +2723,10 @@ * task from using an unfair proportion of the * physical cpu's resources. -ck */ - if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(p) || rt_task(smt_curr)) && - p->mm && smt_curr->mm && !rt_task(p)) + if ((proportionally_more_timeslice(smt_curr, p, sd) || + rt_task(smt_curr) || batch_task(p)) && + p->mm && smt_curr->mm && !rt_task(p) && + !batch_task(smt_curr)) ret = 1; /* @@ -2117,20 +2734,31 @@ * or wake it up if it has been put to sleep for priority * reasons. */ - if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(smt_curr) || rt_task(p)) && - smt_curr->mm && p->mm && !rt_task(smt_curr)) || + if (((proportionally_more_timeslice(p, smt_curr, sd) || + rt_task(p) || batch_task(smt_curr)) && + smt_curr->mm && p->mm && !rt_task(smt_curr) && + !batch_task(p)) || (smt_curr == smt_rq->idle && smt_rq->nr_running)) resched_task(smt_curr); } return ret; } + +static inline int dependent_idle(const task_t *p) +{ + return p == p->rq->idle; +} #else static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq) { } -static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p) +static inline int dependent_sleeper(int cpu, task_t *p) +{ + return 0; +} + +static inline int dependent_idle(const task_t *p) { return 0; } @@ -2144,10 +2772,7 @@ long *switch_count; task_t *prev, *next; runqueue_t *rq; - prio_array_t *array; - struct list_head *queue; unsigned long long now; - unsigned long run_time; int cpu, idx; /* @@ -2169,7 +2794,7 @@ need_resched: preempt_disable(); prev = current; - rq = this_rq(); + rq = prev->rq; /* * The idle thread is not allowed to schedule! @@ -2182,19 +2807,8 @@ release_kernel_lock(prev); now = sched_clock(); - if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) - run_time = now - prev->timestamp; - else - run_time = NS_MAX_SLEEP_AVG; - - /* - * Tasks with interactive credits get charged less run_time - * at high sleep_avg to delay them losing their interactive - * status - */ - if (HIGH_CREDIT(prev)) - run_time /= (CURRENT_BONUS(prev) ? : 1); + prev->runtime = now - prev->timestamp; spin_lock_irq(&rq->lock); /* @@ -2208,70 +2822,43 @@ unlikely(signal_pending(prev)))) prev->state = TASK_RUNNING; else - deactivate_task(prev, rq); + deactivate_task(prev); } cpu = smp_processor_id(); - if (unlikely(!rq->nr_running)) { + if (unlikely(needs_idle_balance(rq))) idle_balance(cpu, rq); - if (!rq->nr_running) { - next = rq->idle; - rq->expired_timestamp = 0; - wake_sleeping_dependent(cpu, rq); - goto switch_tasks; - } - } - - array = rq->active; - if (unlikely(!array->nr_active)) { - /* - * Switch the active and expired arrays. - */ - rq->active = rq->expired; - rq->expired = array; - array = rq->active; - rq->expired_timestamp = 0; - rq->best_expired_prio = MAX_PRIO; - } - idx = sched_find_first_bit(array->bitmap); - queue = array->queue + idx; - next = list_entry(queue->next, task_t, run_list); - - if (dependent_sleeper(cpu, rq, next)) { - next = rq->idle; + idx = sched_find_first_bit(rq->bitmap); + next = list_entry(rq->queues[idx].queue.next, task_t, run_list); + if (dependent_idle(next)) { + wake_sleeping_dependent(cpu, rq); goto switch_tasks; } - if (!rt_task(next) && next->activated > 0) { - unsigned long long delta = now - next->timestamp; - - if (next->activated == 1) - delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; - - array = next->array; - dequeue_task(next, array); - recalc_task_prio(next, next->timestamp + delta); - enqueue_task(next, array); - } - next->activated = 0; + if (dependent_sleeper(cpu, next)) + next = rq->idle; switch_tasks: prefetch(next); clear_tsk_need_resched(prev); RCU_qsctr(task_cpu(prev))++; - prev->sleep_avg -= run_time; - if ((long)prev->sleep_avg <= 0) { - prev->sleep_avg = 0; - if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev))) - prev->interactive_credit--; - } + delta_cpu_stats(prev, now); prev->timestamp = now; + if (unlikely(next->flags & PF_YIELDED)) { + next->flags &= ~PF_YIELDED; + dequeue_task(next); + next->prio = effective_prio(next); + enqueue_task_head(next); + } if (likely(prev != next)) { + rq->preempted = 0; + rq->cache_ticks = 0; + delta_delay_stats(next, now); next->timestamp = now; rq->nr_switches++; - rq->curr = next; + set_rq_current_task(next); ++*switch_count; prepare_arch_switch(rq, next); @@ -2531,9 +3118,9 @@ void set_user_nice(task_t *p, long nice) { unsigned long flags; - prio_array_t *array; - runqueue_t *rq; - int old_prio, new_prio, delta; + int queued; + spinlock_t *rql; + int new_prio, delta; if (TASK_NICE(p) == nice || nice < -20 || nice > 19) return; @@ -2541,7 +3128,7 @@ * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. */ - rq = task_rq_lock(p, &flags); + rql = task_rq_lock(p, &flags); /* * The RT priorities are set via setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -2552,27 +3139,26 @@ p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - array = p->array; - if (array) - dequeue_task(p, array); + if ((queued = task_queued(p))) + dequeue_task(p); - old_prio = p->prio; new_prio = NICE_TO_PRIO(nice); - delta = new_prio - old_prio; - p->static_prio = NICE_TO_PRIO(nice); + delta = new_prio - p->static_prio; + p->static_prio = new_prio; p->prio += delta; + p->eb_shares = nice_to_shares(nice); - if (array) { - enqueue_task(p, array); + if (queued) { + enqueue_task(p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: */ - if (delta < 0 || (delta > 0 && task_running(rq, p))) - resched_task(rq->curr); + if (delta < 0 || (delta > 0 && task_is_running(p))) + resched_task(p->rq->curr); } out_unlock: - task_rq_unlock(rq, &flags); + task_rq_unlock(rql, &flags); } EXPORT_SYMBOL(set_user_nice); @@ -2675,10 +3261,10 @@ /* Actually do priority change: must hold rq lock. */ static void __setscheduler(struct task_struct *p, int policy, int prio) { - BUG_ON(p->array); + BUG_ON(task_queued(p)); p->policy = policy; p->rt_priority = prio; - if (policy != SCHED_NORMAL) + if (SCHED_RT(policy)) p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority; else p->prio = p->static_prio; @@ -2692,9 +3278,9 @@ struct sched_param lp; int retval = -EINVAL; int oldprio; - prio_array_t *array; + int queued; unsigned long flags; - runqueue_t *rq; + spinlock_t *rql; task_t *p; if (!param || pid < 0) @@ -2719,14 +3305,13 @@ * To be able to change p->policy safely, the apropriate * runqueue lock must be held. */ - rq = task_rq_lock(p, &flags); + rql = task_rq_lock(p, &flags); if (policy < 0) policy = p->policy; else { retval = -EINVAL; - if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL) + if (!SCHED_RANGE(policy)) goto out_unlock; } @@ -2737,43 +3322,51 @@ retval = -EINVAL; if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1) goto out_unlock; - if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0)) + if (!SCHED_RT(policy) != (lp.sched_priority == 0)) goto out_unlock; retval = -EPERM; - if ((policy == SCHED_FIFO || policy == SCHED_RR) && - !capable(CAP_SYS_NICE)) - goto out_unlock; + if (SCHED_RT(policy) && !capable(CAP_SYS_NICE)) + /* + * If the caller requested an RT policy without having the + * necessary rights, we downgrade the policy to SCHED_ISO. + */ + policy = SCHED_ISO; if ((current->euid != p->euid) && (current->euid != p->uid) && !capable(CAP_SYS_NICE)) goto out_unlock; + if (!(p->mm) && policy == SCHED_BATCH) + /* + * Don't allow kernel threads to be SCHED_BATCH. + */ + goto out_unlock; + retval = security_task_setscheduler(p, policy, &lp); if (retval) goto out_unlock; - array = p->array; - if (array) - deactivate_task(p, task_rq(p)); + if ((queued = task_queued(p))) + deactivate_task(p); retval = 0; oldprio = p->prio; __setscheduler(p, policy, lp.sched_priority); - if (array) { - __activate_task(p, task_rq(p)); + if (queued) { + __activate_task(p); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (task_running(rq, p)) { + if (task_is_running(p)) { if (p->prio > oldprio) - resched_task(rq->curr); - } else if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); + resched_task(p); + } else + preempt_curr_if_warranted(p); } out_unlock: - task_rq_unlock(rq, &flags); + task_rq_unlock(rql, &flags); out_unlock_tasklist: read_unlock_irq(&tasklist_lock); @@ -2973,37 +3566,131 @@ return real_len; } +void get_task_sched_stats(const struct task_struct *tsk, struct task_sched_stats *stats) +{ + int on_runq = 0; + int on_cpu = 0; + unsigned long long timestamp; + unsigned long flags; + spinlock_t *rql = task_rq_lock(tsk, &flags); + + stats->timestamp = tsk->rq->timestamp_last_tick; + stats->cycle_count = tsk->cycle_count; + stats->total_sleep = tsk->total_sleep; + stats->total_cpu = tsk->total_cpu; + stats->total_delay = tsk->total_delay; + stats->intr_wake_ups = tsk->intr_wake_ups; + timestamp = tsk->sched_timestamp; + if ((on_runq = task_queued(tsk))) + on_cpu = tsk->rq->idle == tsk; + + task_rq_unlock(rql, &flags); + + /* + * Update values to the previous tick (only) + */ + if (stats->timestamp > timestamp) { + unsigned long long delta = stats->timestamp - timestamp; + + if (on_cpu) { + stats->total_cpu += delta; + } else if (on_runq) { + stats->total_delay += delta; + } else { + stats->total_sleep += delta; + } + } +} + +EXPORT_SYMBOL(get_task_sched_stats); + +/* + * Get scheduling statistics for the nominated CPU + */ +void get_cpu_sched_stats(unsigned int cpu, struct cpu_sched_stats *stats) +{ + int idle; + unsigned long long idle_timestamp; + runqueue_t *rq = cpu_rq(cpu); + + /* + * No need to crash the whole machine if they've asked for stats for + * a non existent CPU, just send back zero. + */ + if (rq == NULL) { + stats->timestamp = 0; + stats->total_idle = 0; + stats->total_busy = 0; + stats->total_delay = 0; + stats->nr_switches = 0; + + return; + } + local_irq_disable(); + spin_lock(&rq->lock); + idle = rq->curr == rq->idle; + stats->timestamp = rq->timestamp_last_tick; + idle_timestamp = rq->idle->sched_timestamp; + stats->total_idle = rq->idle->total_cpu; + stats->total_busy = rq->idle->total_delay; + stats->total_delay = rq->total_delay; + stats->nr_switches = rq->nr_switches; + spin_unlock_irq(&rq->lock); + + /* + * Update idle/busy time to the current tick + */ + if (idle) + stats->total_idle += (stats->timestamp - idle_timestamp); + else + stats->total_busy += (stats->timestamp - idle_timestamp); +} + +EXPORT_SYMBOL(get_cpu_sched_stats); + /** * sys_sched_yield - yield the current processor to other threads. * - * this function yields the current CPU by moving the calling thread - * to the expired array. If there are no other threads running on this * CPU then this function will return. */ asmlinkage long sys_sched_yield(void) { - runqueue_t *rq = this_rq_lock(); - prio_array_t *array = current->array; - prio_array_t *target = rq->expired; + spinlock_t *rql = this_rq_lock(); - /* - * We implement yielding by moving the task into the expired - * queue. + dequeue_task(current); + if (sched_mode == SCHED_MODE_STAIRCASE) + goto yield_staircase; + /* If there's other tasks on this CPU make sure that at least + * one of them get some CPU before this task's next bite of the + * cherry. Dequeue before looking for the appropriate run + * queue so that we don't find our queue if we were the sole + * occupant of that queue. * - * (special rule: RT tasks will just roundrobin in the active - * array.) + * special rule: RT tasks will just roundrobin. */ - if (rt_task(current)) - target = rq->active; - - dequeue_task(current, array); - enqueue_task(current, target); + if (likely(!rt_task(current))) { + runqueue_t *rq = current->rq; + int idx = find_next_bit(rq->bitmap, IDLE_PRIO, get_rq_current_prio(rq)); + if (idx < IDLE_PRIO) + current->prio = idx; + } + goto out; +yield_staircase: + current->slice = slice(current); + current->time_slice = RR_INTERVAL(); + if (likely(!rt_task(current) && !batch_task(current))) { + current->flags |= PF_YIELDED; + current->prio = MAX_PRIO - 2; + } + current->burst = 0; +out: + enqueue_task(current); /* * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: */ - _raw_spin_unlock(&rq->lock); + _raw_spin_unlock(rql); preempt_enable_no_resched(); schedule(); @@ -3079,6 +3766,8 @@ ret = MAX_USER_RT_PRIO-1; break; case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_ISO: ret = 0; break; } @@ -3102,6 +3791,8 @@ ret = 1; break; case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_ISO: ret = 0; } return ret; @@ -3135,8 +3826,13 @@ if (retval) goto out_unlock; - jiffies_to_timespec(p->policy & SCHED_FIFO ? + if (sched_mode == SCHED_MODE_STAIRCASE) + jiffies_to_timespec(p->policy & SCHED_FIFO ? + 0 : slice(p), &t); + else + jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : task_timeslice(p), &t); + read_unlock(&tasklist_lock); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; out_nounlock: @@ -3249,15 +3945,26 @@ runqueue_t *rq = cpu_rq(cpu); unsigned long flags; - idle->sleep_avg = 0; - idle->interactive_credit = 0; - idle->array = NULL; - idle->prio = MAX_PRIO; + idle->prio = IDLE_PRIO; + /* + * Initialize scheduling statistics counters as they may provide + * valuable about the CPU + */ + initialize_stats(idle); + initialize_bonuses(idle); idle->state = TASK_RUNNING; + idle->burst = 0; set_task_cpu(idle, cpu); spin_lock_irqsave(&rq->lock, flags); - rq->curr = rq->idle = idle; + rq->idle = idle; + set_rq_current_task(idle); + idle->sched_timestamp = adjusted_sched_clock(idle); + /* + * Putting the idle process onto a run queue simplifies the selection of + * the next task to run in schedule(). + */ + enqueue_task(idle); set_tsk_need_resched(idle); spin_unlock_irqrestore(&rq->lock, flags); @@ -3309,11 +4016,11 @@ unsigned long flags; int ret = 0; migration_req_t req; - runqueue_t *rq; + spinlock_t *rql; perfctr_set_cpus_allowed(p, new_mask); - rq = task_rq_lock(p, &flags); + rql = task_rq_lock(p, &flags); if (!cpus_intersects(new_mask, cpu_online_map)) { ret = -EINVAL; goto out; @@ -3326,14 +4033,14 @@ if (migrate_task(p, any_online_cpu(new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ - task_rq_unlock(rq, &flags); - wake_up_process(rq->migration_thread); + task_rq_unlock(rql, &flags); + wake_up_process(p->rq->migration_thread); wait_for_completion(&req.done); tlb_migrate_finish(p->mm); return 0; } out: - task_rq_unlock(rq, &flags); + task_rq_unlock(rql, &flags); return ret; } @@ -3366,21 +4073,25 @@ if (!cpu_isset(dest_cpu, p->cpus_allowed)) goto out; - set_task_cpu(p, dest_cpu); - if (p->array) { + if (task_queued(p)) { + /* + * Don't do set_task_cpu() until AFTER we dequeue the task, + * since dequeue_task() relies on p->rq always being accurate. + */ + deactivate_task(p); + delta_delay_stats(p, adjusted_sched_clock(p)); + set_task_cpu(p, dest_cpu); /* - * Sync timestamp with rq_dest's before activating. - * The same thing could be achieved by doing this step - * afterwards, and pretending it was a local activate. - * This way is cleaner and logically correct. + * activate_task() will set the timestamp correctly so there's + * no need to adjust it here */ - p->timestamp = p->timestamp - rq_src->timestamp_last_tick - + rq_dest->timestamp_last_tick; - deactivate_task(p, rq_src); - activate_task(p, rq_dest, 0); - if (TASK_PREEMPTS_CURR(p, rq_dest)) - resched_task(rq_dest->curr); + activate_task(p); + preempt_curr_if_warranted(p); + } else { + delta_sleep_stats(p, adjusted_sched_clock(p)); + set_task_cpu(p, dest_cpu); } + adjust_sched_timestamp(p, rq_src); out: double_rq_unlock(rq_src, rq_dest); @@ -3527,9 +4238,10 @@ */ spin_lock_irqsave(&rq->lock, flags); + dequeue_task(p); __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); /* Add idle task to _front_ of it's priority queue */ - __activate_idle_task(p, rq); + __activate_task_head(p); spin_unlock_irqrestore(&rq->lock, flags); } @@ -3544,7 +4256,10 @@ { int cpu = (long)hcpu; struct task_struct *p; +#ifdef CONFIG_HOTPLUG_CPU struct runqueue *rq; +#endif + spinlock_t *rql; unsigned long flags; switch (action) { @@ -3555,9 +4270,9 @@ p->flags |= PF_NOFREEZE; kthread_bind(p, cpu); /* Must be high prio: stop_machine expects to yield to it. */ - rq = task_rq_lock(p, &flags); + rql = task_rq_lock(p, &flags); __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); - task_rq_unlock(rq, &flags); + task_rq_unlock(rql, &flags); cpu_rq(cpu)->migration_thread = p; break; case CPU_ONLINE: @@ -3576,12 +4291,13 @@ rq = cpu_rq(cpu); kthread_stop(rq->migration_thread); rq->migration_thread = NULL; - /* Idle task back to normal (off runqueue, low prio) */ - rq = task_rq_lock(rq->idle, &flags); - deactivate_task(rq->idle, rq); - rq->idle->static_prio = MAX_PRIO; + /* Idle task back to normal in IDLE_PRIO slot */ + rql = task_rq_lock(rq->idle, &flags); + deactivate_task(rq->idle); + rq->idle->static_prio = IDLE_PRIO; __setscheduler(rq->idle, SCHED_NORMAL, 0); - task_rq_unlock(rq, &flags); + enqueue_task(rq->idle); + task_rq_unlock(rql, &flags); BUG_ON(rq->nr_running != 0); /* No need to migrate the tasks: it was best-effort if @@ -3966,7 +4682,7 @@ void __init sched_init(void) { runqueue_t *rq; - int i, j, k; + int i, k; #ifdef CONFIG_SMP /* Set up an initial dummy domain for early boot */ @@ -3987,13 +4703,11 @@ #endif for (i = 0; i < NR_CPUS; i++) { - prio_array_t *array; - rq = cpu_rq(i); spin_lock_init(&rq->lock); - rq->active = rq->arrays; - rq->expired = rq->arrays + 1; - rq->best_expired_prio = MAX_PRIO; + + rq->cache_ticks = 0; + rq->preempted = 0; #ifdef CONFIG_SMP rq->sd = &sched_domain_init; @@ -4005,16 +4719,22 @@ #endif atomic_set(&rq->nr_iowait, 0); - for (j = 0; j < 2; j++) { - array = rq->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for bitsearch - __set_bit(MAX_PRIO, array->bitmap); + for (k = 0; k <= IDLE_PRIO; k++) { + rq->queues[k].prio = k; + INIT_LIST_HEAD(&rq->queues[k].queue); } + bitmap_zero(rq->bitmap, NUM_PRIO_SLOTS); + /* delimiter for bitsearch */ + __set_bit(IDLE_PRIO, rq->bitmap); + set_rq_current_prio(rq, (IDLE_PRIO - 21)); + rq->timestamp_last_tick = sched_clock(); + rq->next_prom_due = (jiffies + get_prom_interval(rq)); + rq->total_delay = 0; + rq->eb_yardstick = 0; + rq->eb_ticks_to_decay = time_slice_ticks; + rq->avg_nr_running = 0; } + current->rq = this_rq(); /* needed for non-SMP systems */ /* * The boot idle thread does lazy MMU switching as well: @@ -4090,3 +4810,290 @@ EXPORT_SYMBOL(__preempt_write_lock); #endif /* defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) */ + +#if defined(CONFIG_SYSCTL) +/* + * CPU scheduler control via /proc/sys/cpusched/xxx + */ +enum +{ + CPU_SCHED_END_OF_LIST=0, + CPU_SCHED_TIME_SLICE=1, + CPU_SCHED_SCHED_RR_TIME_SLICE, + CPU_SCHED_BASE_PROMOTION_INTERVAL, + CPU_SCHED_MAX_IA_BONUS, + CPU_SCHED_MAX_TPT_BONUS, + CPU_SCHED_IA_THRESHOLD, + CPU_SCHED_CPU_HOG_THRESHOLD, + CPU_SCHED_LOG_AT_EXIT, + CPU_SCHED_INTERACTIVE, + CPU_SCHED_COMPUTE, + CPU_SCHED_MODE, + CPU_SCHED_INITIAL_IA_BONUS, + CPU_SCHED_HOG_SUB_CYCLE_THRESHOLD, + CPU_SCHED_SCHED_ISO_THRESHOLD, + CPU_SCHED_SCHED_BATCH_TIME_SLICE_MULTIPLIER +}; + +static const unsigned int zero = 0; +static const unsigned int one = 1; +#define min_milli_value zero +static const unsigned int max_milli_value = 1000; +#define min_max_ia_bonus zero +static const unsigned int max_max_ia_bonus = MAX_MAX_IA_BONUS; +#define min_max_tpt_bonus zero +static const unsigned int max_max_tpt_bonus = MAX_MAX_TPT_BONUS; +static unsigned int time_slice_msecs = DEFAULT_TIME_SLICE_MSECS; +static unsigned int sched_rr_time_slice_msecs = DEFAULT_TIME_SLICE_MSECS; +#define min_time_slice_msecs one +static const unsigned int max_time_slice_msecs = MAX_TIME_SLICE_MSECS; +static unsigned int base_prom_interval_msecs = BASE_PROM_INTERVAL_MSECS; +#define min_base_prom_interval_msecs one +static const unsigned int max_base_prom_interval_msecs = INT_MAX; +#define max_hog_sub_cycle_threshold max_base_prom_interval_msecs +#define min_sched_batch_time_slice_multiplier one +static const unsigned int max_sched_batch_time_slice_multiplier = 100; + +static int proc_time_slice_msecs(ctl_table *ctp, int write, struct file *fp, + void __user *buffer, size_t *lenp) +{ + int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp); + + if ((res == 0) && write) + time_slice_ticks = MSECS_TO_JIFFIES_MIN_1(time_slice_msecs); + + return res; +} + +static int proc_sched_rr_time_slice_msecs(ctl_table *ctp, int write, struct file *fp, + void __user *buffer, size_t *lenp) +{ + int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp); + + if ((res == 0) && write) + sched_rr_time_slice_ticks = MSECS_TO_JIFFIES_MIN_1(sched_rr_time_slice_msecs); + + return res; +} + +static int proc_base_prom_interval_msecs(ctl_table *ctp, int write, struct file *fp, + void __user *buffer, size_t *lenp) +{ + int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp); + + if ((res == 0) && write) + base_prom_interval_ticks = MSECS_TO_JIFFIES_MIN_1(base_prom_interval_msecs); + + return res; +} + +static int proc_cpu_hog_threshold(ctl_table *ctp, int write, struct file *fp, + void __user *buffer, size_t *lenp) +{ + int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp); + + if ((res == 0) && write) + cpu_hog_threshold = calc_proportion(cpu_hog_threshold_ppt, 1000); + + return res; +} + +static int proc_ia_threshold(ctl_table *ctp, int write, struct file *fp, + void __user *buffer, size_t *lenp) +{ + int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp); + + if ((res == 0) && write) + ia_threshold = calc_proportion(ia_threshold_ppt, 1000); + + return res; +} + +static int proc_sched_iso_threshold(ctl_table *ctp, int write, struct file *fp, + void __user *buffer, size_t *lenp) +{ + int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp); + + if ((res == 0) && write) + sched_iso_threshold = calc_proportion(sched_iso_threshold_ppt, 1000); + + return res; +} + +#define SCHED_MODE_BUFFER_LEN 16 +static char current_sched_mode[SCHED_MODE_BUFFER_LEN] = ""; +static int proc_sched_mode(ctl_table *ctp, int write, struct file *fp, + void __user *buffer, size_t *lenp) +{ + int res; + + strcpy(current_sched_mode, sched_mode_names[sched_mode]); + res = proc_dostring(ctp, write, fp, buffer, lenp); + + if ((res == 0) && write) { + int i; + + for (i = 0; sched_mode_names[i] != NULL; i++) + if (strcmp(current_sched_mode, sched_mode_names[i]) == 0) + break; + if (sched_mode_names[i] == NULL) + res = -EINVAL; + else /* set the scheduling mode */ + sched_mode = i; + } + + return res; +} + +ctl_table cpu_sched_table[] = { + { + .ctl_name = CPU_SCHED_TIME_SLICE, + .procname = "time_slice", + .data = &time_slice_msecs, + .maxlen = sizeof (unsigned int), + .mode = 0644, + .proc_handler = &proc_time_slice_msecs, + .extra1 = (void *)&min_time_slice_msecs, + .extra2 = (void *)&max_time_slice_msecs + }, + { + .ctl_name = CPU_SCHED_SCHED_RR_TIME_SLICE, + .procname = "sched_rr_time_slice", + .data = &sched_rr_time_slice_msecs, + .maxlen = sizeof (unsigned int), + .mode = 0644, + .proc_handler = &proc_sched_rr_time_slice_msecs, + .extra1 = (void *)&min_time_slice_msecs, + .extra2 = (void *)&max_time_slice_msecs + }, + { + .ctl_name = CPU_SCHED_BASE_PROMOTION_INTERVAL, + .procname = "base_promotion_interval", + .data = &base_prom_interval_msecs, + .maxlen = sizeof (unsigned int), + .mode = 0644, + .proc_handler = &proc_base_prom_interval_msecs, + .extra1 = (void *)&min_base_prom_interval_msecs, + .extra2 = (void *)&max_base_prom_interval_msecs + }, + { + .ctl_name = CPU_SCHED_MAX_IA_BONUS, + .procname = "max_ia_bonus", + .data = &max_ia_bonus, + .maxlen = sizeof (unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = (void *)&min_max_ia_bonus, + .extra2 = (void *)&max_max_ia_bonus + }, + { + .ctl_name = CPU_SCHED_INITIAL_IA_BONUS, + .procname = "initial_ia_bonus", + .data = &initial_ia_bonus, + .maxlen = sizeof (unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = (void *)&min_max_ia_bonus, + .extra2 = (void *)&max_max_ia_bonus + }, + { + .ctl_name = CPU_SCHED_MAX_TPT_BONUS, + .procname = "max_tpt_bonus", + .data = &max_tpt_bonus, + .maxlen = sizeof (unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = (void *)&min_max_tpt_bonus, + .extra2 = (void *)&max_max_tpt_bonus + }, + { + .ctl_name = CPU_SCHED_HOG_SUB_CYCLE_THRESHOLD, + .procname = "hog_sub_cycle_threshold", + .data = &hog_sub_cycle_threshold, + .maxlen = sizeof (unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = (void *)&zero, + .extra2 = (void *)&max_hog_sub_cycle_threshold + }, + { + .ctl_name = CPU_SCHED_IA_THRESHOLD, + .procname = "ia_threshold", + .data = &ia_threshold_ppt, + .maxlen = sizeof (unsigned int), + .mode = 0644, + .proc_handler = &proc_ia_threshold, + .extra1 = (void *)&min_milli_value, + .extra2 = (void *)&max_milli_value + }, + { + .ctl_name = CPU_SCHED_CPU_HOG_THRESHOLD, + .procname = "cpu_hog_threshold", + .data = &cpu_hog_threshold_ppt, + .maxlen = sizeof (unsigned int), + .mode = 0644, + .proc_handler = &proc_cpu_hog_threshold, + .extra1 = (void *)&min_milli_value, + .extra2 = (void *)&max_milli_value + }, + { + .ctl_name = CPU_SCHED_SCHED_ISO_THRESHOLD, + .procname = "sched_iso_threshold", + .data = &sched_iso_threshold_ppt, + .maxlen = sizeof (unsigned int), + .mode = 0644, + .proc_handler = &proc_sched_iso_threshold, + .extra1 = (void *)&min_milli_value, + .extra2 = (void *)&max_milli_value + }, + { + .ctl_name = CPU_SCHED_LOG_AT_EXIT, + .procname = "log_at_exit", + .data = &log_at_exit, + .maxlen = sizeof (unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = (void *)&zero, + .extra2 = (void *)&one + }, + { + .ctl_name = CPU_SCHED_INTERACTIVE, + .procname = "interactive", + .data = &sched_interactive, + .maxlen = sizeof (unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = (void *)&zero, + .extra2 = (void *)&one + }, + { + .ctl_name = CPU_SCHED_COMPUTE, + .procname = "compute", + .data = &sched_compute, + .maxlen = sizeof (unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = (void *)&zero, + .extra2 = (void *)&one + }, + { + .ctl_name = CPU_SCHED_MODE, + .procname = "mode", + .data = ¤t_sched_mode, + .maxlen = SCHED_MODE_BUFFER_LEN, + .mode = 0644, + .proc_handler = &proc_sched_mode, + }, + { + .ctl_name = CPU_SCHED_SCHED_BATCH_TIME_SLICE_MULTIPLIER, + .procname = "sched_batch_time_slice_multiplier", + .data = &sched_batch_time_slice_multiplier, + .maxlen = sizeof (unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = (void *)&min_sched_batch_time_slice_multiplier, + .extra2 = (void *)&max_sched_batch_time_slice_multiplier + }, + { .ctl_name = CPU_SCHED_END_OF_LIST } +}; +#endif Index: Linux-2.6.X/kernel/sysctl.c diff -u Linux-2.6.X/kernel/sysctl.c:1.1.1.12 Linux-2.6.X/kernel/sysctl.c:1.1.1.12.6.1 --- Linux-2.6.X/kernel/sysctl.c:1.1.1.12 Fri Jul 30 11:31:18 2004 +++ Linux-2.6.X/kernel/sysctl.c Sun Aug 1 16:17:46 2004 @@ -148,6 +148,10 @@ #ifdef CONFIG_UNIX98_PTYS extern ctl_table pty_table[]; #endif +/* + * CPU scheduler control variables (lives in sched.c) + */ +extern ctl_table cpu_sched_table[]; /* /proc declarations: */ @@ -636,6 +640,12 @@ .proc_handler = &proc_unknown_nmi_panic, }, #endif + { + .ctl_name = KERN_CPU_SCHED, + .procname = "cpusched", + .mode = 0555, + .child = cpu_sched_table, + }, { .ctl_name = 0 } };