It was possible with pathological timing (see below) for the scheduler
to pick a cycle of threads on each CPU and enter the context switch
path on all of them simultaneously.
Example:
* CPU0 is idle, CPU1 is running thread A
* CPU1 makes high-priority thread B runnable (sending an IPI to CPU0)
* CPU1 reaches a schedule point (or returns from an interrupt) and
decides to run thread B instead
* CPU0 simultaneously takes its IPI and returns, selecting thread A
Now both CPUs enter wait_for_switch() to spin, waiting for the context
switch code on the other thread to finish and mark the thread
runnable. So we have a deadlock: each CPU is spinning, waiting for the
other!
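To make the circular wait concrete, here is a rough sketch of where each
CPU is stuck, in terms of the do_swap()/wait_for_switch()/arch_switch()
code shown in the header below (exactly why B's handle is still
unpublished depends on the preceding history, so treat that half as
illustrative):

	CPU0 (took the IPI):
		new_thread = A;
		wait_for_switch(A);	/* spins: A's switch_handle is
					 * written only by arch_switch()
					 * on CPU1 */

	CPU1 (hit its schedule point):
		new_thread = B;
		wait_for_switch(B);	/* spins: B's switch_handle has
					 * likewise not been published
					 * yet */

Neither CPU gets past wait_for_switch() to reach its own arch_switch()
call, so neither switch_handle is ever written and both spin forever.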
Actually, in practice this seems not to happen on existing hardware
platforms; it's only exercisable in emulation. The reason is that the
hardware IPI time is much faster than the software paths required to
reach a schedule point or interrupt exit, so CPU1 always selects the
newly scheduled thread and no deadlock appears. I tried for a bit to
make this happen with a cycle of three threads, but it's complicated
to get right and I still couldn't get the timing to hit correctly. In
qemu, though, the IPI is implemented as a Unix signal sent to the
thread running the other CPU, which is far slower and opens the window
to see this happen.
The solution is simple enough: don't store the _current thread in the
run queue until we are on the tail end of the context switch path,
after wait_for_switch(), where we are guaranteed to reach the end of
the switch in finite time.
Note that this requires a small change to the logic that handles the
yield case: because we can no longer rely on _current's position in the
run queue to suppress it, we need to do the priority comparison
directly, based on the existing "swap_ok" flag (which has always meant
"yielded", and maybe should be renamed).
Signed-off-by: Andy Ross <andrew.j.ross@intel.com>
kernel/include/kswap.h (195 lines, 4.9 KiB, C):
/*
 * Copyright (c) 2018 Intel Corporation
 *
 * SPDX-License-Identifier: Apache-2.0
 */
#ifndef ZEPHYR_KERNEL_INCLUDE_KSWAP_H_
#define ZEPHYR_KERNEL_INCLUDE_KSWAP_H_

#include <ksched.h>
#include <spinlock.h>
#include <kernel_arch_func.h>

#ifdef CONFIG_STACK_SENTINEL
extern void z_check_stack_sentinel(void);
#else
#define z_check_stack_sentinel() /**/
#endif

/* In SMP, the irq_lock() is a spinlock which is implicitly released
 * and reacquired on context switch to preserve the existing
 * semantics. This means that whenever we are about to return to a
 * thread (via either z_swap() or interrupt/exception return!) we need
 * to restore the lock state to whatever the thread's counter
 * expects.
 */
void z_smp_reacquire_global_lock(struct k_thread *thread);
void z_smp_release_global_lock(struct k_thread *thread);

/* context switching and scheduling-related routines */
#ifdef CONFIG_USE_SWITCH

/* There is an unavoidable SMP race when threads swap -- their thread
 * record is in the queue (and visible to other CPUs) before
 * arch_switch() finishes saving state. We must spin for the switch
 * handle before entering a new thread. See docs on arch_switch().
 *
 * Note: future SMP architectures may need a fence/barrier or cache
 * invalidation here. Current ones don't, and sadly Zephyr doesn't
 * have a framework for that yet.
 */
static inline void wait_for_switch(struct k_thread *thread)
{
#ifdef CONFIG_SMP
	volatile void **shp = (void *)&thread->switch_handle;

	while (*shp == NULL) {
		k_busy_wait(1);
	}
#endif
}

/* New style context switching. arch_switch() is a lower level
 * primitive that doesn't know about the scheduler or return value.
 * Needed for SMP, where the scheduler requires spinlocking that we
 * don't want to have to do in per-architecture assembly.
 *
 * Note that is_spinlock is a compile-time construct which will be
 * optimized out when this function is expanded.
 */
static ALWAYS_INLINE unsigned int do_swap(unsigned int key,
					  struct k_spinlock *lock,
					  int is_spinlock)
{
	ARG_UNUSED(lock);
	struct k_thread *new_thread, *old_thread;

	old_thread = _current;

	z_check_stack_sentinel();

	if (is_spinlock && lock != NULL) {
		k_spin_release(lock);
	}

	new_thread = z_get_next_ready_thread();

	if (new_thread != old_thread) {
#ifdef CONFIG_TIMESLICING
		z_reset_time_slice();
#endif

		old_thread->swap_retval = -EAGAIN;

#ifdef CONFIG_SMP
		_current_cpu->swap_ok = 0;
		new_thread->base.cpu = arch_curr_cpu()->id;

		if (!is_spinlock) {
			z_smp_release_global_lock(new_thread);
		}
#endif
		z_thread_mark_switched_out();
		wait_for_switch(new_thread);
		arch_cohere_stacks(old_thread, NULL, new_thread);
		_current_cpu->current = new_thread;

#ifdef CONFIG_SMP
		/* Add _current back to the run queue HERE. After
		 * wait_for_switch() we are guaranteed to reach the
		 * context switch in finite time, avoiding a potential
		 * deadlock.
		 */
		z_requeue_current(old_thread);
#endif

		void *newsh = new_thread->switch_handle;

		if (IS_ENABLED(CONFIG_SMP)) {
			/* Active threads MUST have a null here */
			new_thread->switch_handle = NULL;
		}
		arch_switch(newsh, &old_thread->switch_handle);
	}

	if (is_spinlock) {
		arch_irq_unlock(key);
	} else {
		irq_unlock(key);
	}

	return _current->swap_retval;
}

static inline int z_swap_irqlock(unsigned int key)
{
	return do_swap(key, NULL, 0);
}

static inline int z_swap(struct k_spinlock *lock, k_spinlock_key_t key)
{
	return do_swap(key.key, lock, 1);
}

static inline void z_swap_unlocked(void)
{
	(void) do_swap(arch_irq_lock(), NULL, 1);
}

#else /* !CONFIG_USE_SWITCH */

extern int arch_swap(unsigned int key);

static inline int z_swap_irqlock(unsigned int key)
{
	int ret;
	z_check_stack_sentinel();
	ret = arch_swap(key);
	return ret;
}

/* If !USE_SWITCH, then spinlocks are guaranteed degenerate as we
 * can't be in SMP. The k_spin_release() call is just for validation
 * handling.
 */
static ALWAYS_INLINE int z_swap(struct k_spinlock *lock, k_spinlock_key_t key)
{
	k_spin_release(lock);
	return z_swap_irqlock(key.key);
}

static inline void z_swap_unlocked(void)
{
	(void) z_swap_irqlock(arch_irq_lock());
}

#endif /* !CONFIG_USE_SWITCH */

/**
 * Set up a "dummy" thread, used at early initialization to launch the
 * first thread on a CPU.
 *
 * Needs to set enough fields such that the context switching code can
 * use it to properly store state, which will just be discarded.
 *
 * The memory of the dummy thread can be completely uninitialized.
 */
static inline void z_dummy_thread_init(struct k_thread *dummy_thread)
{
	dummy_thread->base.thread_state = _THREAD_DUMMY;
#ifdef CONFIG_SCHED_CPU_MASK
	dummy_thread->base.cpu_mask = -1;
#endif
	dummy_thread->base.user_options = K_ESSENTIAL;
#ifdef CONFIG_THREAD_STACK_INFO
	dummy_thread->stack_info.start = 0U;
	dummy_thread->stack_info.size = 0U;
#endif
#ifdef CONFIG_USERSPACE
	dummy_thread->mem_domain_info.mem_domain = &k_mem_domain_default;
#endif

	_current_cpu->current = dummy_thread;
}
#endif /* ZEPHYR_KERNEL_INCLUDE_KSWAP_H_ */