From: Mike Galbraith <mgalbraith@suse.de>
Subject: sched, cpusets: "HPC" cpusets extension
Date: Tue Jan 17 09:33:11 CET 2012
Patch-mainline: Never, RT specific
References: SLE Realtime Extension
Give the user the ability to dynamically influence scheduler behavior
through "HPC" cpusets. (hack alert)
When enabled, the user can dynamically inform the scheduler that a
cpuset cannot tolerate jitter induced by NO_HZ, jiffies update, and
RT load balancing logic. A large generic machine can thus be
re-partitioned on the fly to service transient HPC loads without
requiring the entire machine to run nohz=off continuously.
Should the user invalidate "HPC" prerequisites, the modifiers are
self-canceling for safety reasons. Prerequisites are: the set may not
contain CPU0, must be cpu exclusive (obviously), and must be fully
disconnected from scheduler domains.
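A minimal usage sketch (assumes CONFIG_HPC_CPUSETS=y, boot with
"hpc_cpusets", a v1 cpuset mount, and illustrative CPU numbers):
  # mount -t cgroup -o cpuset none /sys/fs/cgroup/cpuset
  # cd /sys/fs/cgroup/cpuset
  # mkdir hpc
  # echo 4-7 > hpc/cpuset.cpus
  # echo 1 > hpc/cpuset.cpu_exclusive
  # echo 0 > cpuset.sched_load_balance
  # echo 0 > hpc/cpuset.sched_load_balance
  # echo 1 > hpc/cpuset.sched_hpc_rt    # also sets sched_hpc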
Signed-off-by: Mike Galbraith <mgalbraith@suse.de>
---
Documentation/admin-guide/kernel-parameters.txt | 18 +
include/linux/sched.h | 21 +
init/Kconfig | 11 +
kernel/cgroup/cpuset.c | 261 ++++++++++++++++++++++++
kernel/sched/core.c | 84 +++++++
kernel/sched/deadline.c | 9 +-
kernel/sched/rt.c | 23 +-
kernel/sched/sched.h | 16 +
8 files changed, 436 insertions(+), 7 deletions(-)
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1415,6 +1415,24 @@
hpet_mmap= [X86, HPET_MMAP] Allow userspace to mmap HPET
registers. Default set by CONFIG_HPET_MMAP_DEFAULT.
+ hpc_cpusets [KNL] Activate HPC extensions to cpusets. If HPC_CPUSETS
+ is enabled in .config, adds per-cpuset scheduler behavior
+ modifiers to reduce jitter on the CPUs of isolated cpusets.
+ Currently, these modifiers include:
+
+ sched_hpc: switch nohz off for the cpuset, and elect
+ CPU0 for jiffies update duty. This implies that CPU0
+ may not be in any HPC cpuset, as it serves all sets.
+
+ sched_hpc_rt: set sched_hpc, and additionally disable
+ rt push/pull logic. This option extends the isolation
+ achieved by setting sched_load_balance to 0 in parent
+ and child set, to also disable RT load balancing. It
+ explicitly informs the scheduler that the user assumes
+ responsibility for realtime class task placement in
+ addition to the fair class responsibility assumed when
+ initially isolating the cpuset.
+
hugepages= [HW,X86-32,IA-64] HugeTLB pages to allocate at boot.
hugepagesz= [HW,IA-64,PPC,X86-64] The size of the HugeTLB pages.
On x86-64 and powerpc, this option can be specified
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1994,6 +1994,27 @@ extern long sched_getaffinity(pid_t pid,
#define TASK_SIZE_OF(tsk) TASK_SIZE
#endif
+/* Cpuset runqueue behavior modifier flags */
+enum
+{
+ RQ_TICK = 1 << 0, /* CPU keeps the periodic tick (nohz off) */
+ RQ_HPC = 1 << 1, /* CPU is in an HPC cpuset */
+ RQ_HPCRT = 1 << 2, /* RT/DL push/pull logic disabled */
+ RQ_CLEAR = ~0, /* clear all modifier flags */
+};
+
+#ifdef CONFIG_HPC_CPUSETS
+extern int runqueue_is_flagged(int cpu, unsigned flag);
+extern int runqueue_is_isolated(int cpu);
+extern void cpuset_flags_set(int cpu, unsigned bits);
+extern void cpuset_flags_clr(int cpu, unsigned bits);
+#else /* !CONFIG_HPC_CPUSETS */
+static inline int runqueue_is_flagged(int cpu, unsigned flag) { return 0; }
+static inline int runqueue_is_isolated(int cpu) { return 0; }
+static inline void cpuset_flags_set(int cpu, unsigned bits) { }
+static inline void cpuset_flags_clr(int cpu, unsigned bits) { }
+#endif /* CONFIG_HPC_CPUSETS */
+
#ifdef CONFIG_RSEQ
/*
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -957,6 +957,17 @@ config CPUSETS
Say N if unsure.
+config HPC_CPUSETS
+ bool "HPC cpusets"
+ depends on CPUSETS && SMP
+ default n
+ help
+ This option provides per CPUSET scheduler behavior control switches.
+ This is primarily useful on large SMP systems where some partitions
+ may be dedicated to sensitive HPC applications, while others are not.
+
+ Say N if unsure.
+
config PROC_PID_CPUSET
bool "Include legacy /proc/<pid>/cpuset file"
depends on CPUSETS
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -215,6 +215,8 @@ typedef enum {
CS_SCHED_LOAD_BALANCE,
CS_SPREAD_PAGE,
CS_SPREAD_SLAB,
+ CS_SCHED_HPC,
+ CS_SCHED_HPCRT,
} cpuset_flagbits_t;
/* convenient tests for these bits */
@@ -263,6 +265,16 @@ static inline int is_partition_root(cons
return cs->partition_root_state > 0;
}
+static inline int is_sched_hpc(const struct cpuset *cs)
+{
+ return test_bit(CS_SCHED_HPC, &cs->flags);
+}
+
+static inline int is_sched_hpc_rt(const struct cpuset *cs)
+{
+ return test_bit(CS_SCHED_HPCRT, &cs->flags);
+}
+
static struct cpuset top_cpuset = {
.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
(1 << CS_MEM_EXCLUSIVE)),
@@ -529,6 +541,172 @@ static inline void free_cpuset(struct cp
kfree(cs);
}
+#ifdef CONFIG_HPC_CPUSETS
+/* Without boot parameter "hpc_cpusets", HPC functionality is disabled */
+static __read_mostly int hpc_cpusets_enabled;
+
+/**
+ * validate_sched_change() - validate proposed scheduler modifier changes.
+ * @cur: address of an actual, in-use cpuset.
+ * @trial: address of copy of cur, with proposed changes.
+ *
+ * If we replaced the flag and mask values of the current cpuset (cur) with
+ * those values in the trial cpuset (trial), would our various subset and
+ * exclusive rules still be valid? For cpusets with scheduler modifiers,
+ * also set/clear runqueue flags for CPUs entering/leaving the set, so
+ * that cpuset and runqueue states remain in sync.
+ *
+ * Presumes cpuset_mutex held.
+ * Return: 0 if valid, -errno if not.
+ */
+static int
+validate_sched_change(const struct cpuset *cur, const struct cpuset *trial)
+{
+ int cpu;
+
+ if (!hpc_cpusets_enabled || !is_sched_hpc(trial))
+ return 0;
+
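+ /* First CPU in the trial set: 0 => contains CPU0, >= nr_cpu_ids => empty */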
+ cpu = cpumask_first(trial->cpus_allowed);
+
+ if (cur == &top_cpuset || !is_cpu_exclusive(cur))
+ return -EINVAL;
+ /*
+ * HPC cpusets may not contain the boot CPU,
+ * and must be completely isolated or empty.
+ */
+ if (!cpu || is_sched_load_balance(cur))
+ return -EINVAL;
+ if (cpu < nr_cpu_ids && !runqueue_is_isolated(cpu))
+ return -EINVAL;
+
+ /* Handle CPUs entering or leaving the set */
+ if (!cpumask_equal(cur->cpus_allowed, trial->cpus_allowed)) {
+ cpumask_var_t delta;
+ int entering;
+ unsigned bits;
+
+ if (!zalloc_cpumask_var(&delta, GFP_KERNEL))
+ return -ENOMEM;
+
+ cpumask_xor(delta, cur->cpus_allowed, trial->cpus_allowed);
+ entering = cpumask_weight(cur->cpus_allowed) <
+ cpumask_weight(trial->cpus_allowed);
+
+ bits = RQ_TICK | RQ_HPC;
+ if (is_sched_hpc_rt(trial))
+ bits |= RQ_HPCRT;
+
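+ /* CPUs may only enter while fully isolated */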
+ if (entering) {
+ for_each_cpu(cpu, delta) {
+ if (runqueue_is_isolated(cpu))
+ continue;
+ free_cpumask_var(delta);
+ return -EINVAL;
+ }
+ }
+
+ for_each_cpu(cpu, delta) {
+ if (entering)
+ cpuset_flags_set(cpu, bits);
+ else
+ cpuset_flags_clr(cpu, bits);
+ }
+ free_cpumask_var(delta);
+ }
+
+ return 0;
+}
+
+/*
+ * update_sched_flags - update scheduler modifier flags in cpusets.
+ * @bit: the bit changing state.
+ * @cs: the cpuset in which flags need to be updated.
+ * @turning_on: whether we're turning the bit on or off.
+ *
+ * Called with cpuset_mutex held. Turn scheduler modifiers on/off,
+ * updating runqueue flags for associated CPUs. Set/clear of a flag
+ * which invalidates modifiers recursively clears invalidated flags
+ * for child cpusets and their associated CPUs.
+ *
+ * No return value.
+ */
+static void
+update_sched_flags(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *child;
+ unsigned cpu, bits = 0, recursive = 0;
+
+ switch (bit) {
+ case CS_CPU_EXCLUSIVE:
+ if (turning_on)
+ return;
+ bits = RQ_CLEAR;
+ recursive = 1;
+ break;
+ case CS_SCHED_LOAD_BALANCE:
+ if (!turning_on)
+ return;
+ if (is_sched_hpc(cs)) {
+ bits |= RQ_TICK | RQ_HPC;
+ clear_bit(CS_SCHED_HPC, &cs->flags);
+ }
+ if (is_sched_hpc_rt(cs)) {
+ bits |= RQ_HPCRT;
+ clear_bit(CS_SCHED_HPCRT, &cs->flags);
+ }
+ recursive = 1;
+ break;
+ case CS_SCHED_HPC:
+ bits = RQ_TICK | RQ_HPC;
+ break;
+ case CS_SCHED_HPCRT:
+ bits = RQ_HPCRT;
+ break;
+ default:
+ return;
+ }
+
+ /* Kill lockdep rq->lock false positive */
+ lockdep_off();
+
+ if (recursive) {
+ cpuset_for_each_child(child, css, cs)
+ update_sched_flags(bit, child, turning_on);
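+ /* Modifiers were invalidated: from here on, we only ever clear */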
+ turning_on = 0;
+ }
+
+ if (!bits)
+ goto out;
+
+ for_each_cpu(cpu, cs->cpus_allowed) {
+ if (turning_on)
+ cpuset_flags_set(cpu, bits);
+ else
+ cpuset_flags_clr(cpu, bits);
+ }
+out:
+ lockdep_on();
+}
+
+static void hpc_cpusets_disable(void);
+
+#else /* !CONFIG_HPC_CPUSETS */
+
+static int
+validate_sched_change(const struct cpuset *cur, const struct cpuset *trial)
+{
+ return 0;
+}
+static void
+update_sched_flags(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) { }
+
+static void hpc_cpusets_disable(void) { }
+
+#endif /* CONFIG_HPC_CPUSETS */
+
/*
* validate_change() - Used to validate that any proposed cpuset change
* follows the structural rules for cpusets.
@@ -563,6 +741,10 @@ static int validate_change(struct cpuset
if (!is_cpuset_subset(c, trial))
goto out;
+ ret = validate_sched_change(cur, trial);
+ if (ret)
+ return ret;
+
/* Remaining checks don't apply to root cpuset */
ret = 0;
if (cur == &top_cpuset)
@@ -1864,6 +2046,7 @@ static int update_flag(cpuset_flagbits_t
struct cpuset *trialcs;
int balance_flag_changed;
int spread_flag_changed;
+ int sched_flag_changed;
int err;
trialcs = alloc_trial_cpuset(cs);
@@ -1882,6 +2065,11 @@ static int update_flag(cpuset_flagbits_t
balance_flag_changed = (is_sched_load_balance(cs) !=
is_sched_load_balance(trialcs));
+ sched_flag_changed = balance_flag_changed;
+ sched_flag_changed |= (is_cpu_exclusive(cs) != is_cpu_exclusive(trialcs));
+ sched_flag_changed |= (is_sched_hpc(cs) != is_sched_hpc(trialcs));
+ sched_flag_changed |= (is_sched_hpc_rt(cs) != is_sched_hpc_rt(trialcs));
+
spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
|| (is_spread_page(cs) != is_spread_page(trialcs)));
@@ -1892,6 +2080,9 @@ static int update_flag(cpuset_flagbits_t
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
rebuild_sched_domains_locked();
+ if (sched_flag_changed)
+ update_sched_flags(bit, cs, turning_on);
+
if (spread_flag_changed)
update_tasks_flags(cs);
out:
@@ -2237,6 +2428,8 @@ typedef enum {
FILE_MEMORY_PRESSURE,
FILE_SPREAD_PAGE,
FILE_SPREAD_SLAB,
+ FILE_SCHED_HPC,
+ FILE_SCHED_HPCRT,
} cpuset_filetype_t;
static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
@@ -2277,6 +2470,18 @@ static int cpuset_write_u64(struct cgrou
case FILE_SPREAD_SLAB:
retval = update_flag(CS_SPREAD_SLAB, cs, val);
break;
+ case FILE_SCHED_HPC:
+ if (!val && is_sched_hpc_rt(cs))
+ retval = update_flag(CS_SCHED_HPCRT, cs, val);
+ if (!retval)
+ retval = update_flag(CS_SCHED_HPC, cs, val);
+ break;
+ case FILE_SCHED_HPCRT:
+ if (val && !is_sched_hpc(cs))
+ retval = update_flag(CS_SCHED_HPC, cs, val);
+ if (!retval)
+ retval = update_flag(CS_SCHED_HPCRT, cs, val);
+ break;
default:
retval = -EINVAL;
break;
@@ -2429,6 +2634,10 @@ static u64 cpuset_read_u64(struct cgroup
return is_mem_hardwall(cs);
case FILE_SCHED_LOAD_BALANCE:
return is_sched_load_balance(cs);
+ case FILE_SCHED_HPC:
+ return is_sched_hpc(cs);
+ case FILE_SCHED_HPCRT:
+ return is_sched_hpc_rt(cs);
case FILE_MEMORY_MIGRATE:
return is_memory_migrate(cs);
case FILE_MEMORY_PRESSURE_ENABLED:
@@ -2614,6 +2823,25 @@ static struct cftype legacy_files[] = {
.private = FILE_MEMORY_PRESSURE_ENABLED,
},
+#ifdef CONFIG_HPC_CPUSETS
+ /* These MUST be the last array elements */
+ {
+ .name = "sched_hpc",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_SCHED_HPC,
+ },
+
+ {
+ .name = "sched_hpc_rt",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_SCHED_HPCRT,
+ },
+#endif
+
{ } /* terminate */
};
@@ -2885,6 +3113,8 @@ int __init cpuset_init(void)
BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
+ hpc_cpusets_disable();
+
return 0;
}
@@ -3596,3 +3826,34 @@ void cpuset_task_status_allowed(struct s
seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
nodemask_pr_args(&task->mems_allowed));
}
+
+#ifdef CONFIG_HPC_CPUSETS
+static int __init hpc_cpusets(char *str)
+{
+ hpc_cpusets_enabled = 1;
+
+ return 0;
+}
+early_param("hpc_cpusets", hpc_cpusets);
+
+static void __init hpc_cpusets_disable(void)
+{
+ struct cftype *cft;
+ int got = 0;
+
+ if (hpc_cpusets_enabled)
+ return;
+
+ /*
+ * Q: Why the fsck did you turn it _off_ like this?
+ * A: Turning it _on_ in hpc_cpusets() didn't work.
+ */
+ for (cft = &legacy_files[0]; cft->name[0] != '\0'; cft++) {
+ if (!got && !strcmp(cft->name, "sched_hpc"))
+ got = 1;
+ if (got)
+ cft->name[0] = '\0';
+ }
+}
+#endif
+
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7004,6 +7004,9 @@ void __init sched_init(void)
rq->last_blocked_load_update_tick = jiffies;
atomic_set(&rq->nohz_flags, 0);
#endif
+#ifdef CONFIG_HPC_CPUSETS
+ rq->cpuset_flags = 0;
+#endif
#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
@@ -8227,6 +8230,87 @@ static void migrate_disabled_sched(struc
#endif
+#ifdef CONFIG_HPC_CPUSETS
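+/* CPUs currently in HPC cpusets; while nonzero, CPU0 must keep the tick */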
+static int nr_hpc_cpus;
+
+/* Called with cpuset_mutex held */
+void cpuset_flags_set(int cpu, unsigned bits)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+ unsigned nr, bit;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ /* Reduce @bits to newly set flags; set them before taking any action */
+ bits ^= rq->cpuset_flags;
+ rq->cpuset_flags |= bits;
+ for (nr = 0; bits; nr++) {
+ bit = 1 << nr;
+ if (!(bits & bit))
+ continue;
+ switch (bit) {
+ case RQ_TICK:
+ wake_up_idle_cpu(cpu);
+ break;
+ case RQ_HPC:
+ /* Ensure that jiffies doesn't go stale */
+ if (!nr_hpc_cpus++) {
+ tick_do_timer_cpu = 0;
+ /* safe, CPU0 may not be in any HPC cpuset */
+ cpuset_flags_set(0, RQ_TICK);
+ }
+ break;
+ case RQ_HPCRT:
+ cpupri_set(&rq->rd->cpupri, cpu, CPUPRI_INVALID);
+ break;
+ }
+ bits &= ~bit;
+ }
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+/* Called with cpuset_mutex held */
+void cpuset_flags_clr(int cpu, unsigned bits)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+ unsigned nr, bit;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
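+ /* Reduce @bits to flags currently set; clear them before taking action */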
+ bits &= rq->cpuset_flags;
+ rq->cpuset_flags &= ~bits;
+ for (nr = 0; bits; nr++) {
+ bit = 1 << nr;
+ if (!(bits & bit))
+ continue;
+ switch (bit) {
+ case RQ_TICK:
+ break;
+ case RQ_HPC:
+ /* Let CPU0 resume nohz mode */
+ if (nr_hpc_cpus && !--nr_hpc_cpus)
+ cpuset_flags_clr(0, RQ_TICK);
+ break;
+ case RQ_HPCRT:
+ cpupri_set(&rq->rd->cpupri, cpu, rq->rt.highest_prio.curr);
+ break;
+ }
+ bits &= ~bit;
+ }
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+int runqueue_is_isolated(int cpu)
+{
+ return !cpu_rq(cpu)->sd;
+}
+
+int runqueue_is_flagged(int cpu, unsigned flag)
+{
+ return rq_cpuset_flag(cpu_rq(cpu), flag);
+}
+#endif /* CONFIG_HPC_CPUSETS */
+
#ifndef CONFIG_PREEMPTION
static int __init setup_non_preempt(char *str)
{
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -502,7 +502,7 @@ static int push_dl_task(struct rq *rq);
static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
{
- return dl_task(prev);
+ return !rq_cpuset_flag(rq, RQ_HPCRT) && dl_task(prev);
}
static DEFINE_PER_CPU(struct callback_head, dl_push_head);
@@ -2361,6 +2361,9 @@ static void switched_from_dl(struct rq *
if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
return;
+ if (rq_cpuset_flag(rq, RQ_HPCRT))
+ return;
+
deadline_queue_pull_task(rq);
}
@@ -2382,6 +2385,8 @@ static void switched_to_dl(struct rq *rq
if (rq->curr != p) {
#ifdef CONFIG_SMP
+ if (rq_cpuset_flag(rq, RQ_HPCRT))
+ return;
if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
deadline_queue_push_tasks(rq);
#endif
@@ -2407,7 +2412,7 @@ static void prio_changed_dl(struct rq *r
* we can't argue if the task is increasing
* or lowering its prio, so...
*/
- if (!rq->dl.overloaded)
+ if (!rq->dl.overloaded && !rq_cpuset_flag(rq, RQ_HPCRT))
deadline_queue_pull_task(rq);
/*
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -263,7 +263,7 @@ static void pull_rt_task(struct rq *this
static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
/* Try to pull RT tasks here if we lower this rq's prio */
- return rq->rt.highest_prio.curr > prev->prio;
+ return !rq_cpuset_flag(rq, RQ_HPCRT) && rq->rt.highest_prio.curr > prev->prio;
}
static inline int rt_overloaded(struct rq *rq)
@@ -1045,8 +1045,13 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int
if (&rq->rt != rt_rq)
return;
#endif
- if (rq->online && prio < prev_prio)
- cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
+ if (!rq->online || rq_cpuset_flag(rq, RQ_HPCRT))
+ return;
+
+ if (prio >= prev_prio)
+ return;
+
+ cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
}
static void
@@ -1061,8 +1066,13 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int
if (&rq->rt != rt_rq)
return;
#endif
- if (rq->online && rt_rq->highest_prio.curr != prev_prio)
- cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
+ if (!rq->online || rq_cpuset_flag(rq, RQ_HPCRT))
+ return;
+
+ if (rt_rq->highest_prio.curr == prev_prio)
+ return;
+
+ cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}
#else /* CONFIG_SMP */
@@ -2194,6 +2204,9 @@ static void switched_from_rt(struct rq *
if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
return;
+ if (rq_cpuset_flag(rq, RQ_HPCRT))
+ return;
+
rt_queue_pull_task(rq);
}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -953,6 +953,9 @@ struct rq {
/* For active balancing */
int active_balance;
int push_cpu;
+#ifdef CONFIG_HPC_CPUSETS
+ unsigned int cpuset_flags;
+#endif
struct cpu_stop_work active_balance_work;
/* CPU of this runqueue: */
@@ -2557,3 +2560,16 @@ static inline bool is_per_cpu_kthread(st
return true;
}
#endif
+
+#ifdef CONFIG_HPC_CPUSETS
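+/* tick_do_timer_cpu is defined in kernel/time/tick-common.c */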
+extern int tick_do_timer_cpu __read_mostly;
+static inline int rq_cpuset_flag(struct rq *rq, unsigned flag)
+{
+ return rq->cpuset_flags & flag;
+}
+#ifndef CONFIG_NO_HZ
+static inline void wake_up_idle_cpu(int cpu) { }
+#endif
+#else /* !CONFIG_HPC_CPUSETS */
+static inline int rq_cpuset_flag(struct rq *rq, unsigned flag) { return 0; }
+#endif /* CONFIG_HPC_CPUSETS */