From b4e9ef0691cc48daab02e1e698b045adbdcb769e Mon Sep 17 00:00:00 2001
From: Andy Ross
Date: Wed, 6 Apr 2022 10:10:17 -0700
Subject: [PATCH] kernel/sched: Defer IPI sending to schedule points

The original design intent with arch_sched_ipi() was that
interprocessor interrupts were fast and easily sent, so to reduce
latency the scheduler should notify other CPUs synchronously when
scheduler state changes.

This tends to result in "storms" of IPIs in some use cases, though.
For example, SOF will enumerate over all cores doing a k_sem_give()
to notify a worker thread pinned to each, each call causing a
separate IPI.  Add to that the fact that unlike x86's IO-APIC, the
intel_adsp architecture has targeted/non-broadcast IPIs that need to
be repeated for each core, and suddenly we have an O(N^2) scaling
problem in the number of CPUs.

Instead, batch the "pending" IPIs and send them only at known
scheduling points (end-of-interrupt and swap).  This semantically
matches the locations where application code will "expect" to see
other threads run, so arguably is a better choice anyway.

Signed-off-by: Andy Ross
---
 include/zephyr/kernel_structs.h |  5 ++++
 kernel/sched.c                  | 43 ++++++++++++++++++++++++++++++---
 2 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/include/zephyr/kernel_structs.h b/include/zephyr/kernel_structs.h
index 8258a47f68a..52f630ba399 100644
--- a/include/zephyr/kernel_structs.h
+++ b/include/zephyr/kernel_structs.h
@@ -183,6 +183,11 @@ struct z_kernel {
 #if defined(CONFIG_THREAD_MONITOR)
 	struct k_thread *threads; /* singly linked list of ALL threads */
 #endif
+
+#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_IPI_SUPPORTED)
+	/* Need to signal an IPI at the next scheduling point */
+	bool pending_ipi;
+#endif
 };
 
 typedef struct z_kernel _kernel_t;
diff --git a/kernel/sched.c b/kernel/sched.c
index 6cc03a705db..6e4d97d8bee 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -269,6 +269,25 @@ static ALWAYS_INLINE void dequeue_thread(struct k_thread *thread)
 	}
 }
 
+static void signal_pending_ipi(void)
+{
+	/* Synchronization note: you might think we need to lock these
+	 * two steps, but an IPI is idempotent.  It's OK if we do it
+	 * twice.  All we require is that if a CPU sees the flag true,
+	 * it is guaranteed to send the IPI, and if a core sets
+	 * pending_ipi, the IPI will be sent the next time through
+	 * this code.
+	 */
+#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_IPI_SUPPORTED)
+	if (CONFIG_MP_NUM_CPUS > 1) {
+		if (_kernel.pending_ipi) {
+			_kernel.pending_ipi = false;
+			arch_sched_ipi();
+		}
+	}
+#endif
+}
+
 #ifdef CONFIG_SMP
 /* Called out of z_swap() when CONFIG_SMP.  The current thread can
  * never live in the run queue until we are inexorably on the context
@@ -281,6 +300,7 @@ void z_requeue_current(struct k_thread *curr)
 	if (z_is_thread_queued(curr)) {
 		runq_add(curr);
 	}
+	signal_pending_ipi();
 }
 
 static inline bool is_aborting(struct k_thread *thread)
@@ -585,8 +605,10 @@ static bool thread_active_elsewhere(struct k_thread *thread)
 
 static void flag_ipi(void)
 {
-#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_IPI_SUPPORTED)
-	arch_sched_ipi();
+#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_IPI_SUPPORTED)
+	if (CONFIG_MP_NUM_CPUS > 1) {
+		_kernel.pending_ipi = true;
+	}
 #endif
 }
 
@@ -931,6 +953,7 @@ void z_reschedule(struct k_spinlock *lock, k_spinlock_key_t key)
 		z_swap(lock, key);
 	} else {
 		k_spin_unlock(lock, key);
+		signal_pending_ipi();
 	}
 }
 
@@ -940,6 +963,7 @@ void z_reschedule_irqlock(uint32_t key)
 		z_swap_irqlock(key);
 	} else {
 		irq_unlock(key);
+		signal_pending_ipi();
 	}
 }
 
@@ -973,7 +997,16 @@ void k_sched_unlock(void)
 struct k_thread *z_swap_next_thread(void)
 {
 #ifdef CONFIG_SMP
-	return next_up();
+	struct k_thread *ret = next_up();
+
+	if (ret == _current) {
+		/* When not swapping, have to signal IPIs here.  In
+		 * the context switch case it must happen later, after
+		 * _current gets requeued.
+		 */
+		signal_pending_ipi();
+	}
+	return ret;
 #else
 	return _kernel.ready_q.cache;
 #endif
@@ -1073,6 +1106,7 @@ void *z_get_next_switch_handle(void *interrupted)
 			new_thread->switch_handle = NULL;
 		}
 	}
+	signal_pending_ipi();
 	return ret;
 #else
 	z_sched_usage_switch(_kernel.ready_q.cache);
@@ -1684,6 +1718,9 @@ void z_thread_abort(struct k_thread *thread)
 		/* It's running somewhere else, flag and poke */
 		thread->base.thread_state |= _THREAD_ABORTING;
 
+		/* We're going to spin, so need a true synchronous IPI
+		 * here, not deferred!
+		 */
 #ifdef CONFIG_SCHED_IPI_SUPPORTED
 		arch_sched_ipi();
 #endif
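
For readers following along outside the Zephyr tree, the sketch below distills the pattern the patch adds: wakeup paths merely latch a "pending IPI" flag, and the scheduling points drain it, sending at most one IPI per batch.  The flag_ipi()/signal_pending_ipi() names mirror the patch, but everything else is an illustrative stand-in, not Zephyr code: send_ipi() substitutes for arch_sched_ipi(), and a C11 atomic_bool replaces the kernel's plain _kernel.pending_ipi (the patch gets away with a plain bool because, as its comment argues, a spurious extra IPI is harmless).

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for arch_sched_ipi(): a real port would poke the other
 * CPUs' interrupt controllers here.
 */
static void send_ipi(void)
{
	puts("IPI sent to other CPUs");
}

/* One pending-IPI latch for the whole system, analogous to the
 * patch's _kernel.pending_ipi field.
 */
static atomic_bool pending_ipi;

/* Called wherever the scheduler previously sent an IPI synchronously
 * (e.g. when a thread pinned to another CPU becomes runnable): just
 * record that one is needed.
 */
static void flag_ipi(void)
{
	atomic_store(&pending_ipi, true);
}

/* Called at scheduling points (end of interrupt, swap): send at most
 * one IPI no matter how many flag_ipi() calls happened since the last
 * scheduling point.  Clearing before sending is safe because an IPI
 * is idempotent; the worst case is one extra interrupt.
 */
static void signal_pending_ipi(void)
{
	if (atomic_exchange(&pending_ipi, false)) {
		send_ipi();
	}
}

int main(void)
{
	/* Ten wakeups (think ten k_sem_give() calls in a row)... */
	for (int i = 0; i < 10; i++) {
		flag_ipi();
	}

	/* ...collapse into a single IPI at the next scheduling point. */
	signal_pending_ipi();

	/* Nothing pending now, so this sends nothing. */
	signal_pending_ipi();

	return 0;
}

In the patch itself the drain happens at the scheduling points visible in the diff: z_requeue_current(), the non-swap branches of z_reschedule() and z_reschedule_irqlock(), z_swap_next_thread() when no context switch occurs, and z_get_next_switch_handle().  z_thread_abort() keeps a synchronous arch_sched_ipi() because it is about to spin waiting on the other CPU.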