kernel/sched: Defer IPI sending to schedule points

The original design intent with arch_sched_ipi() was that
interprocessor interrupts were fast and easily sent, so to reduce
latency the scheduler should notify other CPUs synchronously when
scheduler state changes.

This tends to result in "storms" of IPIs in some use cases, though.
For example, SOF will iterate over all cores, doing a k_sem_give() to
notify a worker thread pinned to each one, and every call causes a
separate IPI.  Add to that the fact that, unlike x86's IO-APIC, the
intel_adsp architecture has targeted/non-broadcast IPIs that must be
repeated for each core, and suddenly we have an O(N^2) scaling
problem in the number of CPUs.
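
To make the fan-out concrete, here is a minimal sketch of that kind of
notification loop.  The worker_sem[] array, its initialization, and the
function names are invented for illustration; they are not SOF code and
are not part of this patch:

    #include <zephyr/kernel.h>

    /* Hypothetical per-core worker semaphores, one per CPU. */
    static struct k_sem worker_sem[CONFIG_MP_NUM_CPUS];

    static void init_worker_sems(void)
    {
        for (int core = 0; core < CONFIG_MP_NUM_CPUS; core++) {
            k_sem_init(&worker_sem[core], 0, 1);
        }
    }

    static void notify_all_workers(void)
    {
        for (int core = 0; core < CONFIG_MP_NUM_CPUS; core++) {
            /* Each give may wake a thread pinned to a different CPU.
             * Previously every such wakeup sent its own IPI right
             * away; on targeted-IPI hardware that is one interrupt
             * per destination core, per call.
             */
            k_sem_give(&worker_sem[core]);
        }
    }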

Instead, batch the "pending" IPIs and send them only at known
scheduling points (end-of-interrupt and swap).  This semantically
matches the locations where application code will "expect" to see
other threads run, so arguably is a better choice anyway.
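
Reduced to its essentials, the new shape is "set a flag on wakeup,
flush it with one IPI at the next schedule point".  The sketch below is
a simplification of the flag_ipi()/signal_pending_ipi() pair in the
diff that follows (the CONFIG_SMP/CONFIG_SCHED_IPI_SUPPORTED guards and
the CONFIG_MP_NUM_CPUS check are dropped for brevity):

    /* Wakeup path: note that an IPI is owed instead of sending it now. */
    static void flag_ipi(void)
    {
        _kernel.pending_ipi = true;
    }

    /* Schedule points (swap, end of interrupt): flush with a single IPI.
     * No lock is needed: an IPI is idempotent, so a rare duplicate send
     * is harmless, and a flag set after the check is picked up on the
     * next pass through a schedule point.
     */
    static void signal_pending_ipi(void)
    {
        if (_kernel.pending_ipi) {
            _kernel.pending_ipi = false;
            arch_sched_ipi();
        }
    }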

Signed-off-by: Andy Ross <andrew.j.ross@intel.com>
Author: Andy Ross <andrew.j.ross@intel.com>, 2022-04-06 10:10:17 -07:00
Committed by: Maureen Helm
parent 3267cd327e
commit b4e9ef0691
2 changed files with 45 additions and 3 deletions

Changed file 1 of 2 (struct z_kernel definition):

@@ -183,6 +183,11 @@ struct z_kernel {
 #if defined(CONFIG_THREAD_MONITOR)
 	struct k_thread *threads; /* singly linked list of ALL threads */
 #endif
+
+#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_IPI_SUPPORTED)
+	/* Need to signal an IPI at the next scheduling point */
+	bool pending_ipi;
+#endif
 };
 
 typedef struct z_kernel _kernel_t;

Changed file 2 of 2 (scheduler implementation):

@@ -269,6 +269,25 @@ static ALWAYS_INLINE void dequeue_thread(struct k_thread *thread)
 	}
 }
 
+static void signal_pending_ipi(void)
+{
+	/* Synchronization note: you might think we need to lock these
+	 * two steps, but an IPI is idempotent. It's OK if we do it
+	 * twice. All we require is that if a CPU sees the flag true,
+	 * it is guaranteed to send the IPI, and if a core sets
+	 * pending_ipi, the IPI will be sent the next time through
+	 * this code.
+	 */
+#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_IPI_SUPPORTED)
+	if (CONFIG_MP_NUM_CPUS > 1) {
+		if (_kernel.pending_ipi) {
+			_kernel.pending_ipi = false;
+			arch_sched_ipi();
+		}
+	}
+#endif
+}
+
 #ifdef CONFIG_SMP
 /* Called out of z_swap() when CONFIG_SMP. The current thread can
  * never live in the run queue until we are inexorably on the context
@@ -281,6 +300,7 @@ void z_requeue_current(struct k_thread *curr)
 	if (z_is_thread_queued(curr)) {
 		runq_add(curr);
 	}
+	signal_pending_ipi();
 }
 
 static inline bool is_aborting(struct k_thread *thread)
@@ -585,8 +605,10 @@ static bool thread_active_elsewhere(struct k_thread *thread)
 
 static void flag_ipi(void)
 {
-#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_IPI_SUPPORTED)
-	arch_sched_ipi();
+#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_IPI_SUPPORTED)
+	if (CONFIG_MP_NUM_CPUS > 1) {
+		_kernel.pending_ipi = true;
+	}
 #endif
 }
 
@@ -931,6 +953,7 @@ void z_reschedule(struct k_spinlock *lock, k_spinlock_key_t key)
 		z_swap(lock, key);
 	} else {
 		k_spin_unlock(lock, key);
+		signal_pending_ipi();
 	}
 }
 
@@ -940,6 +963,7 @@ void z_reschedule_irqlock(uint32_t key)
 		z_swap_irqlock(key);
 	} else {
 		irq_unlock(key);
+		signal_pending_ipi();
 	}
 }
 
@@ -973,7 +997,16 @@ void k_sched_unlock(void)
 struct k_thread *z_swap_next_thread(void)
 {
 #ifdef CONFIG_SMP
-	return next_up();
+	struct k_thread *ret = next_up();
+
+	if (ret == _current) {
+		/* When not swapping, have to signal IPIs here. In
+		 * the context switch case it must happen later, after
+		 * _current gets requeued.
+		 */
+		signal_pending_ipi();
+	}
+	return ret;
 #else
 	return _kernel.ready_q.cache;
 #endif
@@ -1073,6 +1106,7 @@ void *z_get_next_switch_handle(void *interrupted)
 			new_thread->switch_handle = NULL;
 		}
 	}
+	signal_pending_ipi();
 	return ret;
 #else
 	z_sched_usage_switch(_kernel.ready_q.cache);
@@ -1684,6 +1718,9 @@ void z_thread_abort(struct k_thread *thread)
 		/* It's running somewhere else, flag and poke */
 		thread->base.thread_state |= _THREAD_ABORTING;
 
+		/* We're going to spin, so need a true synchronous IPI
+		 * here, not deferred!
+		 */
 #ifdef CONFIG_SCHED_IPI_SUPPORTED
 		arch_sched_ipi();
 #endif