From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Sat, 27 May 2017 19:02:06 +0200
Subject: kernel/sched/core: add migrate_disable()
Git-repo: git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git
Git-commit: da77ceac3d20f27310a07a7c346a4ee6b40d6c28
Patch-mainline: Queued in subsystem maintainer repository
References: SLE Realtime Extension

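On PREEMPT_RT, spinlock and rwlock critical sections are preemptible
and may sleep, so preempt_disable() cannot be used to stay on the
current CPU across them. Add migrate_disable()/migrate_enable(), which
pin the calling task to its CPU while leaving preemption enabled: each
task carries a migrate_disable counter, and once a pinned task is
scheduled out, __schedule() restricts its cpus_ptr to the current CPU.
The final migrate_enable() restores the original affinity and, if the
CPU has meanwhile become disallowed, pushes the task away via
migration_cpu_stop().

Each runqueue counts its pinned tasks in rq->nr_pinned so that CPU
hotplug can cooperate: take_cpu_down() returns -EAGAIN while tasks are
still pinned to the dying CPU, and takedown_cpu() sleeps until the last
such task wakes it up through takedown_cpu_task.

A per-task sleeping_lock counter marks sections that acquire the RT
"sleeping spinlock" variants, so that rcu_note_context_switch() does
not warn when such a lock schedules from within an RCU read-side
critical section. get_cpu_light()/put_cpu_light() are added as
migrate_disable()-based replacements for get_cpu()/put_cpu().
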
[bristot@redhat.com: rt: Increase/decrease the nr of migratory tasks when enabling/disabling migration
 Link: https://lkml.kernel.org/r/e981d271cbeca975bca710e2fbcc6078c09741b0.1498482127.git.bristot@redhat.com
]
[swood@redhat.com: fixups and optimisations
 Link: https://lkml.kernel.org/r/20190727055638.20443-1-swood@redhat.com
 Link: https://lkml.kernel.org/r/20191012065214.28109-1-swood@redhat.com
]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Daniel Wagner <dwagner@suse.de>
---
 include/linux/preempt.h    |   32 +++++++
 include/linux/sched.h      |   35 ++++++++
 include/linux/smp.h        |    3 
 init/init_task.c           |    4 
 kernel/cpu.c               |   42 ++++++++++
 kernel/locking/rtmutex.c   |   12 ++
 kernel/locking/rwlock-rt.c |   18 +++-
 kernel/rcu/tree_plugin.h   |    6 +
 kernel/sched/core.c        |  181 ++++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/debug.c       |    4 
 kernel/sched/sched.h       |    4 
 lib/smp_processor_id.c     |    5 +
 12 files changed, 336 insertions(+), 10 deletions(-)

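Illustrative usage sketch (not part of the patch): migrate_disable()
keeps smp_processor_id() stable across a section that may block on an
RT "sleeping" spinlock, where preempt_disable() would not allow
blocking. The per-CPU objects my_lock/my_count and the function
bump_this_cpu() below are made up for this note; the
get_cpu_light()/put_cpu_light() helpers added by the patch wrap the
same migrate_disable() + smp_processor_id() / migrate_enable() steps.

	/* Example only; assume each my_lock was spin_lock_init()ed at boot. */
	static DEFINE_PER_CPU(spinlock_t, my_lock);
	static DEFINE_PER_CPU(unsigned long, my_count);

	static void bump_this_cpu(void)
	{
		int cpu;

		migrate_disable();                  /* pin task, preemption stays enabled */
		cpu = smp_processor_id();           /* stable while pinned */
		spin_lock(&per_cpu(my_lock, cpu));  /* may sleep on PREEMPT_RT */
		per_cpu(my_count, cpu)++;
		spin_unlock(&per_cpu(my_lock, cpu));
		migrate_enable();                   /* unpin; task may migrate again */
	}
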
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -201,6 +201,31 @@ do { \
 
 #define preemptible()	(preempt_count() == 0 && !irqs_disabled())
 
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+
+extern void migrate_disable(void);
+extern void migrate_enable(void);
+
+int __migrate_disabled(struct task_struct *p);
+
+#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+
+extern void migrate_disable(void);
+extern void migrate_enable(void);
+static inline int __migrate_disabled(struct task_struct *p)
+{
+	return 0;
+}
+
+#else
+#define migrate_disable()		preempt_disable()
+#define migrate_enable()		preempt_enable()
+static inline int __migrate_disabled(struct task_struct *p)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_PREEMPTION
 #define preempt_enable() \
 do { \
@@ -270,6 +295,13 @@ do { \
 #define preempt_check_resched_rt()		barrier()
 #define preemptible()				0
 
+#define migrate_disable()			barrier()
+#define migrate_enable()			barrier()
+
+static inline int __migrate_disabled(struct task_struct *p)
+{
+	return 0;
+}
 #endif /* CONFIG_PREEMPT_COUNT */
 
 #ifdef MODULE
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -229,6 +229,8 @@ extern void io_schedule_finish(int token
 extern long io_schedule_timeout(long timeout);
 extern void io_schedule(void);
 
+int cpu_nr_pinned(int cpu);
+
 /**
  * struct prev_cputime - snapshot of system and user cputime
  * @utime: time spent in user mode
@@ -705,6 +707,20 @@ struct task_struct {
 	int				nr_cpus_allowed;
 	const cpumask_t			*cpus_ptr;
 	cpumask_t			cpus_mask;
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+	int				migrate_disable;
+	bool				migrate_disable_scheduled;
+# ifdef CONFIG_SCHED_DEBUG
+	int				pinned_on_cpu;
+# endif
+#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+# ifdef CONFIG_SCHED_DEBUG
+	int				migrate_disable;
+# endif
+#endif
+#ifdef CONFIG_PREEMPT_RT
+	int				sleeping_lock;
+#endif
 
 #ifdef CONFIG_PREEMPT_RCU
 	int				rcu_read_lock_nesting;
@@ -1866,6 +1882,23 @@ static __always_inline bool need_resched
 	return unlikely(tif_need_resched());
 }
 
+#ifdef CONFIG_PREEMPT_RT
+static inline void sleeping_lock_inc(void)
+{
+	current->sleeping_lock++;
+}
+
+static inline void sleeping_lock_dec(void)
+{
+	current->sleeping_lock--;
+}
+
+#else
+
+static inline void sleeping_lock_inc(void) { }
+static inline void sleeping_lock_dec(void) { }
+#endif
+
 /*
  * Wrappers for p->thread_info->cpu access. No-op on UP.
  */
@@ -2055,4 +2088,6 @@ int sched_trace_rq_nr_running(struct rq
 
 const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
 
+extern struct task_struct *takedown_cpu_task;
+
 #endif
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -221,6 +221,9 @@ static inline int get_boot_cpu_id(void)
 #define get_cpu()		({ preempt_disable(); __smp_processor_id(); })
 #define put_cpu()		preempt_enable()
 
+#define get_cpu_light()		({ migrate_disable(); __smp_processor_id(); })
+#define put_cpu_light()		migrate_enable()
+
 /*
  * Callback to arch code if there's nosmp or maxcpus=0 on the
  * boot command line:
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -73,6 +73,10 @@ struct task_struct init_task
 	.cpus_ptr	= &init_task.cpus_mask,
 	.cpus_mask	= CPU_MASK_ALL,
 	.nr_cpus_allowed= NR_CPUS,
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) && \
+    defined(CONFIG_SCHED_DEBUG)
+	.pinned_on_cpu	= -1,
+#endif
 	.mm		= NULL,
 	.active_mm	= &init_mm,
 	.restart_block	= {
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -849,6 +849,15 @@ static int take_cpu_down(void *_param)
 	int err, cpu = smp_processor_id();
 	int ret;
 
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * If any tasks disabled migration before we got here,
+	 * go back and sleep again.
+	 */
+	if (cpu_nr_pinned(cpu))
+		return -EAGAIN;
+#endif
+
 	/* Ensure this CPU doesn't handle any more interrupts. */
 	err = __cpu_disable();
 	if (err < 0)
@@ -878,6 +887,8 @@ static int take_cpu_down(void *_param)
 	return 0;
 }
 
+struct task_struct *takedown_cpu_task;
+
 static int takedown_cpu(unsigned int cpu)
 {
 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
@@ -892,11 +903,39 @@ static int takedown_cpu(unsigned int cpu
 	 */
 	irq_lock_sparse();
 
+#ifdef CONFIG_PREEMPT_RT
+	WARN_ON_ONCE(takedown_cpu_task);
+	takedown_cpu_task = current;
+
+again:
+	/*
+	 * If a task pins this CPU after we pass this check, take_cpu_down
+	 * will return -EAGAIN.
+	 */
+	for (;;) {
+		int nr_pinned;
+
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		nr_pinned = cpu_nr_pinned(cpu);
+		if (nr_pinned == 0)
+			break;
+		schedule();
+	}
+	set_current_state(TASK_RUNNING);
+#endif
+
 	/*
 	 * So now all preempt/rcu users must observe !cpu_active().
 	 */
 	err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));
+#ifdef CONFIG_PREEMPT_RT
+	if (err == -EAGAIN)
+		goto again;
+#endif
 	if (err) {
+#ifdef CONFIG_PREEMPT_RT
+		takedown_cpu_task = NULL;
+#endif
 		/* CPU refused to die */
 		irq_unlock_sparse();
 		/* Unpark the hotplug thread so we can rollback there */
@@ -915,6 +954,9 @@ static int takedown_cpu(unsigned int cpu
 	wait_for_ap_thread(st, false);
 	BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
 
+#ifdef CONFIG_PREEMPT_RT
+	takedown_cpu_task = NULL;
+#endif
 	/* Interrupts are moved away from the dying cpu, reenable alloc/free */
 	irq_unlock_sparse();
 
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1140,6 +1140,7 @@ void __sched rt_spin_lock_slowunlock(str
 
 void __lockfunc rt_spin_lock(spinlock_t *lock)
 {
+	sleeping_lock_inc();
 	migrate_disable();
 	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
 	rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
@@ -1154,6 +1155,7 @@ void __lockfunc __rt_spin_lock(struct rt
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
 {
+	sleeping_lock_inc();
 	migrate_disable();
 	spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
 	rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
@@ -1166,6 +1168,7 @@ void __lockfunc rt_spin_unlock(spinlock_
 	spin_release(&lock->dep_map, _RET_IP_);
 	rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
 	migrate_enable();
+	sleeping_lock_dec();
 }
 EXPORT_SYMBOL(rt_spin_unlock);
 
@@ -1191,12 +1194,15 @@ int __lockfunc rt_spin_trylock(spinlock_
 {
 	int ret;
 
+	sleeping_lock_inc();
 	migrate_disable();
 	ret = __rt_mutex_trylock(&lock->lock);
-	if (ret)
+	if (ret) {
 		spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
-	else
+	} else {
 		migrate_enable();
+		sleeping_lock_dec();
+	}
 	return ret;
 }
 EXPORT_SYMBOL(rt_spin_trylock);
@@ -1208,6 +1214,7 @@ int __lockfunc rt_spin_trylock_bh(spinlo
 	local_bh_disable();
 	ret = __rt_mutex_trylock(&lock->lock);
 	if (ret) {
+		sleeping_lock_inc();
 		migrate_disable();
 		spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
 	} else
@@ -1223,6 +1230,7 @@ int __lockfunc rt_spin_trylock_irqsave(s
 	*flags = 0;
 	ret = __rt_mutex_trylock(&lock->lock);
 	if (ret) {
+		sleeping_lock_inc();
 		migrate_disable();
 		spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
 	}
--- a/kernel/locking/rwlock-rt.c
+++ b/kernel/locking/rwlock-rt.c
@@ -305,12 +305,15 @@ int __lockfunc rt_read_trylock(rwlock_t
 {
 	int ret;
 
+	sleeping_lock_inc();
 	migrate_disable();
 	ret = do_read_rt_trylock(rwlock);
-	if (ret)
+	if (ret) {
 		rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_);
-	else
+	} else {
 		migrate_enable();
+		sleeping_lock_dec();
+	}
 	return ret;
 }
 EXPORT_SYMBOL(rt_read_trylock);
@@ -319,18 +322,22 @@ int __lockfunc rt_write_trylock(rwlock_t
 {
 	int ret;
 
+	sleeping_lock_inc();
 	migrate_disable();
 	ret = do_write_rt_trylock(rwlock);
-	if (ret)
+	if (ret) {
 		rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
-	else
+	} else {
 		migrate_enable();
+		sleeping_lock_dec();
+	}
 	return ret;
 }
 EXPORT_SYMBOL(rt_write_trylock);
 
 void __lockfunc rt_read_lock(rwlock_t *rwlock)
 {
+	sleeping_lock_inc();
 	migrate_disable();
 	rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_);
 	do_read_rt_lock(rwlock);
@@ -339,6 +346,7 @@ EXPORT_SYMBOL(rt_read_lock);
 
 void __lockfunc rt_write_lock(rwlock_t *rwlock)
 {
+	sleeping_lock_inc();
 	migrate_disable();
 	rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
 	do_write_rt_lock(rwlock);
@@ -350,6 +358,7 @@ void __lockfunc rt_read_unlock(rwlock_t
 	rwlock_release(&rwlock->dep_map, _RET_IP_);
 	do_read_rt_unlock(rwlock);
 	migrate_enable();
+	sleeping_lock_dec();
 }
 EXPORT_SYMBOL(rt_read_unlock);
 
@@ -358,6 +367,7 @@ void __lockfunc rt_write_unlock(rwlock_t
 	rwlock_release(&rwlock->dep_map, _RET_IP_);
 	do_write_rt_unlock(rwlock);
 	migrate_enable();
+	sleeping_lock_dec();
 }
 EXPORT_SYMBOL(rt_write_unlock);
 
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -287,11 +287,15 @@ void rcu_note_context_switch(bool preemp
 	struct task_struct *t = current;
 	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 	struct rcu_node *rnp;
+	int sleeping_l = 0;
 
 	barrier(); /* Avoid RCU read-side critical sections leaking down. */
 	trace_rcu_utilization(TPS("Start context switch"));
 	lockdep_assert_irqs_disabled();
-	WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
+#if defined(CONFIG_PREEMPT_RT)
+	sleeping_l = t->sleeping_lock;
+#endif
+	WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0 && !sleeping_l);
 	if (t->rcu_read_lock_nesting > 0 &&
 	    !t->rcu_read_unlock_special.b.blocked) {
 
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1465,7 +1465,7 @@ static inline bool is_cpu_allowed(struct
 	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
 		return false;
 
-	if (is_per_cpu_kthread(p))
+	if (is_per_cpu_kthread(p) || __migrate_disabled(p))
 		return cpu_online(cpu);
 
 	return cpu_active(cpu);
@@ -1587,9 +1587,18 @@ static int migration_cpu_stop(void *data
 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
 {
 	cpumask_copy(&p->cpus_mask, new_mask);
-	p->nr_cpus_allowed = cpumask_weight(new_mask);
+	if (p->cpus_ptr == &p->cpus_mask)
+		p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
 
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+int __migrate_disabled(struct task_struct *p)
+{
+	return p->migrate_disable;
+}
+EXPORT_SYMBOL_GPL(__migrate_disabled);
+#endif
+
 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
 	struct rq *rq = task_rq(p);
@@ -1683,7 +1692,8 @@ static int __set_cpus_allowed_ptr(struct
 	}
 
 	/* Can the task run on the task's current CPU? If so, we're done */
-	if (cpumask_test_cpu(task_cpu(p), new_mask))
+	if (cpumask_test_cpu(task_cpu(p), new_mask) ||
+	    p->cpus_ptr != &p->cpus_mask)
 		goto out;
 
 	if (task_running(rq, p) || p->state == TASK_WAKING) {
@@ -4123,6 +4133,8 @@ restart:
 	BUG();
 }
 
+static void migrate_disabled_sched(struct task_struct *p);
+
 /*
  * __schedule() is the main scheduler function.
  *
@@ -4201,6 +4213,9 @@ static void __sched notrace __schedule(b
 	rq_lock(rq, &rf);
 	smp_mb__after_spinlock();
 
+	if (__migrate_disabled(prev))
+		migrate_disabled_sched(prev);
+
 	/* Promote REQ to ACT */
 	rq->clock_update_flags <<= 1;
 	update_rq_clock(rq);
@@ -6481,6 +6496,7 @@ static void migrate_tasks(struct rq *dea
 			break;
 
 		next = __pick_migrate_task(rq);
+		WARN_ON_ONCE(__migrate_disabled(next));
 
 		/*
 		 * Rules for changing task_struct::cpus_mask are holding
@@ -7960,6 +7976,165 @@ void call_trace_sched_update_nr_running(
         trace_sched_update_nr_running_tp(rq, count);
 }
 
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+
+static inline void
+update_nr_migratory(struct task_struct *p, long delta)
+{
+	if (unlikely((p->sched_class == &rt_sched_class ||
+		      p->sched_class == &dl_sched_class) &&
+		      p->nr_cpus_allowed > 1)) {
+		if (p->sched_class == &rt_sched_class)
+			task_rq(p)->rt.rt_nr_migratory += delta;
+		else
+			task_rq(p)->dl.dl_nr_migratory += delta;
+	}
+}
+
+static inline void
+migrate_disable_update_cpus_allowed(struct task_struct *p)
+{
+	p->cpus_ptr = cpumask_of(smp_processor_id());
+	update_nr_migratory(p, -1);
+	p->nr_cpus_allowed = 1;
+}
+
+static inline void
+migrate_enable_update_cpus_allowed(struct task_struct *p)
+{
+	struct rq *rq;
+	struct rq_flags rf;
+
+	rq = task_rq_lock(p, &rf);
+	p->cpus_ptr = &p->cpus_mask;
+	p->nr_cpus_allowed = cpumask_weight(&p->cpus_mask);
+	update_nr_migratory(p, 1);
+	task_rq_unlock(rq, p, &rf);
+}
+
+void migrate_disable(void)
+{
+	preempt_disable();
+
+	if (++current->migrate_disable == 1) {
+		this_rq()->nr_pinned++;
+#ifdef CONFIG_SCHED_DEBUG
+		WARN_ON_ONCE(current->pinned_on_cpu >= 0);
+		current->pinned_on_cpu = smp_processor_id();
+#endif
+	}
+
+	preempt_enable();
+}
+EXPORT_SYMBOL(migrate_disable);
+
+static void migrate_disabled_sched(struct task_struct *p)
+{
+	if (p->migrate_disable_scheduled)
+		return;
+
+	migrate_disable_update_cpus_allowed(p);
+	p->migrate_disable_scheduled = 1;
+}
+
+void migrate_enable(void)
+{
+	struct task_struct *p = current;
+	struct rq *rq = this_rq();
+	int cpu = task_cpu(p);
+
+	WARN_ON_ONCE(p->migrate_disable <= 0);
+	if (p->migrate_disable > 1) {
+		p->migrate_disable--;
+		return;
+	}
+
+	preempt_disable();
+
+#ifdef CONFIG_SCHED_DEBUG
+	WARN_ON_ONCE(current->pinned_on_cpu != cpu);
+	current->pinned_on_cpu = -1;
+#endif
+
+	WARN_ON_ONCE(rq->nr_pinned < 1);
+
+	p->migrate_disable = 0;
+	rq->nr_pinned--;
+	if (rq->nr_pinned == 0 && unlikely(!cpu_active(cpu)) &&
+	    takedown_cpu_task)
+		wake_up_process(takedown_cpu_task);
+
+	if (!p->migrate_disable_scheduled)
+		goto out;
+
+	p->migrate_disable_scheduled = 0;
+
+	migrate_enable_update_cpus_allowed(p);
+
+	WARN_ON(smp_processor_id() != cpu);
+	if (!is_cpu_allowed(p, cpu)) {
+		struct migration_arg arg = { p };
+		struct rq_flags rf;
+
+		rq = task_rq_lock(p, &rf);
+		update_rq_clock(rq);
+		arg.dest_cpu = select_fallback_rq(cpu, p);
+		task_rq_unlock(rq, p, &rf);
+
+		preempt_enable();
+
+		sleeping_lock_inc();
+		stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
+		sleeping_lock_dec();
+		return;
+
+	}
+
+out:
+	preempt_enable();
+}
+EXPORT_SYMBOL(migrate_enable);
+
+int cpu_nr_pinned(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	return rq->nr_pinned;
+}
+
+#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+static void migrate_disabled_sched(struct task_struct *p)
+{
+}
+
+void migrate_disable(void)
+{
+#ifdef CONFIG_SCHED_DEBUG
+	current->migrate_disable++;
+#endif
+	barrier();
+}
+EXPORT_SYMBOL(migrate_disable);
+
+void migrate_enable(void)
+{
+#ifdef CONFIG_SCHED_DEBUG
+	struct task_struct *p = current;
+
+	WARN_ON_ONCE(p->migrate_disable <= 0);
+	p->migrate_disable--;
+#endif
+	barrier();
+}
+EXPORT_SYMBOL(migrate_enable);
+
+#else
+static void migrate_disabled_sched(struct task_struct *p)
+{
+}
+
+#endif
+
 #ifndef CONFIG_PREEMPTION
 static int __init setup_non_preempt(char *str)
 {
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -955,6 +955,10 @@ void proc_sched_show_task(struct task_st
 		P(dl.runtime);
 		P(dl.deadline);
 	}
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+	P(migrate_disable);
+#endif
+	P(nr_cpus_allowed);
 #undef PN_SCHEDSTAT
 #undef PN
 #undef __PN
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1021,6 +1021,10 @@ struct rq {
 	/* Must be inspected within a rcu lock section */
 	struct cpuidle_state	*idle_state;
 #endif
+
+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+	int			nr_pinned;
+#endif
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -23,6 +23,11 @@ unsigned int check_preemption_disabled(c
 	 * Kernel threads bound to a single CPU can safely use
 	 * smp_processor_id():
 	 */
+#if defined(CONFIG_PREEMPT_RT) && (defined(CONFIG_SMP) || defined(CONFIG_SCHED_DEBUG))
+	if (current->migrate_disable)
+		goto out;
+#endif
+
 	if (current->nr_cpus_allowed == 1)
 		goto out;