/*
 * Copyright (c) 2019 Intel Corporation
 *
 * SPDX-License-Identifier: Apache-2.0
 */
#include <zephyr/kernel.h>
#include <zephyr/sys/printk.h>
#include <zephyr/wait_q.h>
#include <ksched.h>

/* This is a scheduler microbenchmark, designed to measure latencies
 * of specific low-level scheduling primitives independent of overhead
 * from application or API abstractions.  It works very simply: a main
 * thread creates a "partner" thread at a higher priority, and the
 * partner then sleeps using z_pend_curr_irqlock().  From this initial
 * state:
 *
 * 1. The main thread calls z_unpend_first_thread()
 * 2. The main thread calls z_ready_thread()
 * 3. The main thread calls k_yield()
 *    (the kernel switches to the partner thread)
 * 4. The partner thread then runs and calls z_pend_curr_irqlock() again
 *    (the kernel switches to the main thread)
 * 5. The main thread returns from k_yield()
 *
 * It then iterates this cycle many times, reporting timestamp
 * latencies between each numbered step and for the whole cycle, and a
 * running average over all cycles run.
 */

#define N_RUNS 1000
#define N_SETTLE 10

static K_THREAD_STACK_DEFINE(partner_stack, 1024);
static struct k_thread partner_thread;

_wait_q_t waitq;

enum {
	UNPENDING,
	UNPENDED_READYING,
	READIED_YIELDING,
	PARTNER_AWAKE_PENDING,
	YIELDED,
	NUM_STAMP_STATES
};

uint32_t stamps[NUM_STAMP_STATES];

static inline uint32_t _stamp(int state)
{
	uint32_t t;

	/* In theory the TSC has much lower overhead and higher
	 * precision.  In practice it's VERY jittery in recent qemu
	 * versions and frankly too noisy to trust.
	 */
#ifdef CONFIG_X86
	__asm__ volatile("rdtsc" : "=a"(t) : : "edx");
#else
	t = k_cycle_get_32();
#endif

	stamps[state] = t;
	return t;
}

/* #define stamp(s) printk("%s @ %d\n", #s, _stamp(s)) */
#define stamp(s) _stamp(s)

static void partner_fn(void *arg1, void *arg2, void *arg3)
{
	ARG_UNUSED(arg1);
	ARG_UNUSED(arg2);
	ARG_UNUSED(arg3);

	printk("Running %p\n", k_current_get());

	while (true) {
		unsigned int key = irq_lock();

		z_pend_curr_irqlock(key, &waitq, K_FOREVER);
		stamp(PARTNER_AWAKE_PENDING);
	}
}

int main(void)
{
	z_waitq_init(&waitq);

	int main_prio = k_thread_priority_get(k_current_get());
	int partner_prio = main_prio - 1;

	k_tid_t th = k_thread_create(&partner_thread, partner_stack,
				     K_THREAD_STACK_SIZEOF(partner_stack),
				     partner_fn, NULL, NULL, NULL,
				     partner_prio, 0, K_NO_WAIT);

	/* Let it start running and pend */
	k_sleep(K_MSEC(100));

	uint64_t tot = 0U;
	uint32_t runs = 0U;

	for (int i = 0; i < N_RUNS + N_SETTLE; i++) {
		stamp(UNPENDING);
		z_unpend_first_thread(&waitq);
		stamp(UNPENDED_READYING);
		z_ready_thread(th);
		stamp(READIED_YIELDING);

		/* z_ready_thread() does not reschedule, so this is
		 * guaranteed to be the point where we will yield to
		 * the new thread, which (being higher priority) will
		 * run immediately, and we'll wake up synchronously as
		 * soon as it pends.
		 */
		k_yield();
		stamp(YIELDED);

		uint32_t avg, whole = stamps[YIELDED] - stamps[UNPENDING];

		if (++runs > N_SETTLE) {
			/* Only compute averages after the first
			 * N_SETTLE runs, to let performance settle;
			 * cache effects on the host pollute the early
			 * data.
			 */
			tot += whole;
			avg = tot / (runs - N_SETTLE);
		} else {
			tot = 0U;
			avg = 0U;
		}

		/* For reference, an unmodified HEAD on qemu_x86 with
		 * !USERSPACE and SCHED_DUMB and using -icount
		 * shift=0,sleep=off,align=off, I get results of:
		 *
		 * unpend 132 ready 257 switch 278 pend 321 tot 988 (avg 900)
		 */
		printk("unpend %4d ready %4d switch %4d pend %4d tot %4d (avg %4d)\n",
		       stamps[UNPENDED_READYING] - stamps[UNPENDING],
		       stamps[READIED_YIELDING] - stamps[UNPENDED_READYING],
		       stamps[PARTNER_AWAKE_PENDING] - stamps[READIED_YIELDING],
		       stamps[YIELDED] - stamps[PARTNER_AWAKE_PENDING],
		       whole, avg);
	}
	printk("fin\n");
	return 0;
}
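
/* For comparison only: a minimal, disabled sketch (not part of the
 * benchmark) of the same ping-pong handshake written against the
 * public k_sem API.  The names below (wake_partner, wake_main,
 * sem_partner_fn, one_sem_cycle) are hypothetical and appear nowhere
 * in the benchmark above.  A public-API round trip folds steps 1-3
 * (unpend, ready, yield) into a single k_sem_give() call, so it can
 * only time the whole cycle, not the individual primitives; that is
 * why the benchmark above calls the internal z_*() scheduler
 * functions directly.
 */
#if 0
K_SEM_DEFINE(wake_partner, 0, 1);	/* hypothetical */
K_SEM_DEFINE(wake_main, 0, 1);		/* hypothetical */

/* Would run at higher priority, created like partner_thread above */
static void sem_partner_fn(void *a, void *b, void *c)
{
	while (true) {
		/* Pend until the main thread wakes us: the public
		 * equivalent of z_pend_curr_irqlock() on waitq.
		 */
		k_sem_take(&wake_partner, K_FOREVER);
		k_sem_give(&wake_main);
	}
}

static void one_sem_cycle(void)
{
	uint32_t t0 = k_cycle_get_32();

	/* Readies the higher-priority partner and reschedules in one
	 * call; we resume here once the partner has pended again.
	 */
	k_sem_give(&wake_partner);
	k_sem_take(&wake_main, K_FOREVER);

	printk("sem round trip: %u cycles\n", k_cycle_get_32() - t0);
}
#endif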