kernel/sched: Defer IPI sending to schedule points

The original design intent with arch_sched_ipi() was that
interprocessor interrupts were fast and easily sent, so to reduce
latency the scheduler should notify other CPUs synchronously when
scheduler state changes.

This tends to result in "storms" of IPIs in some use cases, though.
For example, SOF will iterate over all cores, doing a k_sem_give() to
notify a worker thread pinned to each one, and every call causes a
separate IPI.  Add to that the fact that, unlike x86's IO-APIC, the
intel_adsp architecture has targeted/non-broadcast IPIs that must be
repeated for each core, and suddenly we have an O(N^2) scaling
problem in the number of CPUs.
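
To make the fan-out concrete, here is a minimal sketch of that kind of
notification loop.  The worker_sem[] array, its initialization, and the
function names are invented for illustration; they are not SOF code and
are not part of this patch:

    #include <zephyr/kernel.h>

    /* Hypothetical per-core worker semaphores, one per CPU. */
    static struct k_sem worker_sem[CONFIG_MP_NUM_CPUS];

    static void init_worker_sems(void)
    {
        for (int core = 0; core < CONFIG_MP_NUM_CPUS; core++) {
            k_sem_init(&worker_sem[core], 0, 1);
        }
    }

    static void notify_all_workers(void)
    {
        for (int core = 0; core < CONFIG_MP_NUM_CPUS; core++) {
            /* Each give may wake a thread pinned to a different CPU.
             * Previously every such wakeup sent its own IPI right
             * away; on targeted-IPI hardware that is one interrupt
             * per destination core, per call.
             */
            k_sem_give(&worker_sem[core]);
        }
    }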

Instead, batch the "pending" IPIs and send them only at known
scheduling points (end-of-interrupt and swap).  This semantically
matches the locations where application code will "expect" to see
other threads run, so arguably is a better choice anyway.
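
Reduced to its essentials, the new shape is "set a flag on wakeup,
flush it with one IPI at the next schedule point".  The sketch below is
a simplification of the flag_ipi()/signal_pending_ipi() pair in the
diff that follows (the CONFIG_SMP/CONFIG_SCHED_IPI_SUPPORTED guards and
the CONFIG_MP_NUM_CPUS check are dropped for brevity):

    /* Wakeup path: note that an IPI is owed instead of sending it now. */
    static void flag_ipi(void)
    {
        _kernel.pending_ipi = true;
    }

    /* Schedule points (swap, end of interrupt): flush with a single IPI.
     * No lock is needed: an IPI is idempotent, so a rare duplicate send
     * is harmless, and a flag set after the check is picked up on the
     * next pass through a schedule point.
     */
    static void signal_pending_ipi(void)
    {
        if (_kernel.pending_ipi) {
            _kernel.pending_ipi = false;
            arch_sched_ipi();
        }
    }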

Signed-off-by: Andy Ross <andrew.j.ross@intel.com>
Author: Andy Ross <andrew.j.ross@intel.com>, 2022-04-06 10:10:17 -07:00
Committed by: Maureen Helm
parent 3267cd327e
commit b4e9ef0691
2 changed files with 45 additions and 3 deletions

Changed file 1 of 2 (struct z_kernel definition):

@@ -183,6 +183,11 @@ struct z_kernel {
 #if defined(CONFIG_THREAD_MONITOR)
 	struct k_thread *threads; /* singly linked list of ALL threads */
 #endif
+
+#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_IPI_SUPPORTED)
+	/* Need to signal an IPI at the next scheduling point */
+	bool pending_ipi;
+#endif
 };
 
 typedef struct z_kernel _kernel_t;

Changed file 2 of 2 (scheduler implementation):

@@ -269,6 +269,25 @@ static ALWAYS_INLINE void dequeue_thread(struct k_thread *thread)
 	}
 }
 
+static void signal_pending_ipi(void)
+{
+	/* Synchronization note: you might think we need to lock these
+	 * two steps, but an IPI is idempotent. It's OK if we do it
+	 * twice. All we require is that if a CPU sees the flag true,
+	 * it is guaranteed to send the IPI, and if a core sets
+	 * pending_ipi, the IPI will be sent the next time through
+	 * this code.
+	 */
+#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_IPI_SUPPORTED)
+	if (CONFIG_MP_NUM_CPUS > 1) {
+		if (_kernel.pending_ipi) {
+			_kernel.pending_ipi = false;
+			arch_sched_ipi();
+		}
+	}
+#endif
+}
+
 #ifdef CONFIG_SMP
 /* Called out of z_swap() when CONFIG_SMP. The current thread can
  * never live in the run queue until we are inexorably on the context
@@ -281,6 +300,7 @@ void z_requeue_current(struct k_thread *curr)
 	if (z_is_thread_queued(curr)) {
 		runq_add(curr);
 	}
+	signal_pending_ipi();
 }
 
 static inline bool is_aborting(struct k_thread *thread)
@@ -585,8 +605,10 @@ static bool thread_active_elsewhere(struct k_thread *thread)
 
 static void flag_ipi(void)
 {
-#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_IPI_SUPPORTED)
-	arch_sched_ipi();
+#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_IPI_SUPPORTED)
+	if (CONFIG_MP_NUM_CPUS > 1) {
+		_kernel.pending_ipi = true;
+	}
 #endif
 }
 
@@ -931,6 +953,7 @@ void z_reschedule(struct k_spinlock *lock, k_spinlock_key_t key)
 		z_swap(lock, key);
 	} else {
 		k_spin_unlock(lock, key);
+		signal_pending_ipi();
 	}
 }
 
@@ -940,6 +963,7 @@ void z_reschedule_irqlock(uint32_t key)
 		z_swap_irqlock(key);
 	} else {
 		irq_unlock(key);
+		signal_pending_ipi();
 	}
 }
 
@@ -973,7 +997,16 @@ void k_sched_unlock(void)
 struct k_thread *z_swap_next_thread(void)
 {
 #ifdef CONFIG_SMP
-	return next_up();
+	struct k_thread *ret = next_up();
+
+	if (ret == _current) {
+		/* When not swapping, have to signal IPIs here. In
+		 * the context switch case it must happen later, after
+		 * _current gets requeued.
+		 */
+		signal_pending_ipi();
+	}
+	return ret;
 #else
 	return _kernel.ready_q.cache;
 #endif
@@ -1073,6 +1106,7 @@ void *z_get_next_switch_handle(void *interrupted)
 			new_thread->switch_handle = NULL;
 		}
 	}
+	signal_pending_ipi();
 	return ret;
 #else
 	z_sched_usage_switch(_kernel.ready_q.cache);
@@ -1684,6 +1718,9 @@ void z_thread_abort(struct k_thread *thread)
 		/* It's running somewhere else, flag and poke */
 		thread->base.thread_state |= _THREAD_ABORTING;
 
+		/* We're going to spin, so need a true synchronous IPI
+		 * here, not deferred!
+		 */
 #ifdef CONFIG_SCHED_IPI_SUPPORTED
 		arch_sched_ipi();
 #endif