zephyr/include/arch/x86/arch.h
Daniel Leung 80fb6538b3 x86: use =A as output for RDTSC on x86-32
The timing_info benchmark on qemu_x86 shows this is a bit faster.

Before:
  START - Time Measurement
  Timing results: Clock frequency: 1000 MHz
  Context switch                               : 896 cycles ,   895 ns
  Interrupt latency                            : 768 cycles ,   767 ns
  Tick overhead                                :14912 cycles , 14911 ns
  Thread creation                              :18688 cycles , 18687 ns
  Thread abort (non-running)                   :49216 cycles , 49215 ns
  Thread abort (_current)                      :55616 cycles , 55615 ns
  Thread suspend                               :11072 cycles , 11071 ns
  Thread resume                                :10272 cycles , 10271 ns
  Thread yield                                 :12213 cycles , 12212 ns
  Thread sleep                                 :17984 cycles , 17983 ns
  Heap malloc                                  :21702 cycles , 21701 ns
  Heap free                                    :15176 cycles , 15175 ns
  Semaphore take with context switch           :19168 cycles , 19167 ns
  Semaphore give with context switch           :18400 cycles , 18399 ns
  Semaphore take without context switch        :2208 cycles ,  2207 ns
  Semaphore give without context switch        :4704 cycles ,  4703 ns
  Mutex lock                                   :1952 cycles ,  1951 ns
  Mutex unlock                                 :7936 cycles ,  7935 ns
  Message queue put with context switch        :20320 cycles , 20319 ns
  Message queue put without context switch     :5792 cycles ,  5791 ns
  Message queue get with context switch        :22112 cycles , 22111 ns
  Message queue get without context switch     :5312 cycles ,  5311 ns
  Mailbox synchronous put                      :27936 cycles , 27935 ns
  Mailbox synchronous get                      :23392 cycles , 23391 ns
  Mailbox asynchronous put                     :11808 cycles , 11807 ns
  Mailbox get without context switch           :20416 cycles , 20415 ns
  Drop to user mode                            :643712 cycles , 643711 ns
  User thread creation                         :652096 cycles , 652095 ns
  Syscall overhead                             :2720 cycles ,  2719 ns
  Validation overhead k_object init            :4256 cycles ,  4255 ns
  Validation overhead k_object permission      :4224 cycles ,  4223 ns
  Time Measurement finished

After:
  START - Time Measurement
  Timing results: Clock frequency: 1000 MHz
  Context switch                               : 896 cycles ,   895 ns
  Interrupt latency                            : 768 cycles ,   767 ns
  Tick overhead                                :14752 cycles , 14751 ns
  Thread creation                              :18464 cycles , 18463 ns
  Thread abort (non-running)                   :48992 cycles , 48991 ns
  Thread abort (_current)                      :55552 cycles , 55551 ns
  Thread suspend                               :10848 cycles , 10847 ns
  Thread resume                                :10048 cycles , 10047 ns
  Thread yield                                 :12213 cycles , 12212 ns
  Thread sleep                                 :17984 cycles , 17983 ns
  Heap malloc                                  :21702 cycles , 21701 ns
  Heap free                                    :15176 cycles , 15175 ns
  Semaphore take with context switch           :19104 cycles , 19103 ns
  Semaphore give with context switch           :18368 cycles , 18367 ns
  Semaphore take without context switch        :1984 cycles ,  1983 ns
  Semaphore give without context switch        :4480 cycles ,  4479 ns
  Mutex lock                                   :1728 cycles ,  1727 ns
  Mutex unlock                                 :7712 cycles ,  7711 ns
  Message queue put with context switch        :20224 cycles , 20223 ns
  Message queue put without context switch     :5568 cycles ,  5567 ns
  Message queue get with context switch        :22016 cycles , 22015 ns
  Message queue get without context switch     :5088 cycles ,  5087 ns
  Mailbox synchronous put                      :27840 cycles , 27839 ns
  Mailbox synchronous get                      :23296 cycles , 23295 ns
  Mailbox asynchronous put                     :11584 cycles , 11583 ns
  Mailbox get without context switch           :20192 cycles , 20191 ns
  Drop to user mode                            :643616 cycles , 643615 ns
  User thread creation                         :651872 cycles , 651871 ns
  Syscall overhead                             :2464 cycles ,  2463 ns
  Validation overhead k_object init            :4032 cycles ,  4031 ns
  Validation overhead k_object permission      :4000 cycles ,  3999 ns
  Time Measurement finished

Signed-off-by: Daniel Leung <daniel.leung@intel.com>
2020-09-05 13:28:38 -05:00
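
The change itself is small: on 32-bit x86, z_tsc_read() now binds the
64-bit result directly to the eax:edx register pair with the "=A"
output constraint, instead of reading the two halves into separate
outputs. A minimal sketch (the exact prior code is assumed to have
mirrored the x86_64 path):

  /* before (sketch): two 32-bit outputs recombined via a union */
  __asm__ volatile ("rdtsc" : "=a" (rv.lo), "=d" (rv.hi));

  /* after: "=A" places the 64-bit value in the eax:edx pair */
  __asm__ volatile ("rdtsc" : "=A" (rv.value));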

/*
 * Copyright (c) 2019 Intel Corp.
 * SPDX-License-Identifier: Apache-2.0
 */

#ifndef ZEPHYR_INCLUDE_ARCH_X86_ARCH_H_
#define ZEPHYR_INCLUDE_ARCH_X86_ARCH_H_

#include <devicetree.h>

/* Changing this value will require manual changes to exception and IDT setup
 * in locore.S for intel64
 */
#define Z_X86_OOPS_VECTOR	32
#if !defined(_ASMLANGUAGE)

#include <sys/sys_io.h>
#include <zephyr/types.h>
#include <stddef.h>
#include <stdbool.h>
#include <irq.h>
#include <arch/x86/mmustructs.h>
#include <arch/x86/thread_stack.h>

#ifdef __cplusplus
extern "C" {
#endif
static ALWAYS_INLINE void arch_irq_unlock(unsigned int key)
{
	if ((key & 0x00000200U) != 0U) { /* 'IF' bit */
		__asm__ volatile ("sti" ::: "memory");
	}
}
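
/*
 * Usage sketch: arch_irq_unlock() pairs with arch_irq_lock() (provided
 * by the subarch headers included below) to bracket a critical section;
 * interrupts are re-enabled only if the saved EFLAGS.IF bit was set:
 *
 *	unsigned int key = arch_irq_lock();
 *	... critical section ...
 *	arch_irq_unlock(key);
 */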
static ALWAYS_INLINE void sys_out8(uint8_t data, io_port_t port)
{
	__asm__ volatile("outb %b0, %w1" :: "a"(data), "Nd"(port));
}

static ALWAYS_INLINE uint8_t sys_in8(io_port_t port)
{
	uint8_t ret;

	__asm__ volatile("inb %w1, %b0" : "=a"(ret) : "Nd"(port));

	return ret;
}

static ALWAYS_INLINE void sys_out16(uint16_t data, io_port_t port)
{
	__asm__ volatile("outw %w0, %w1" :: "a"(data), "Nd"(port));
}

static ALWAYS_INLINE uint16_t sys_in16(io_port_t port)
{
	uint16_t ret;

	__asm__ volatile("inw %w1, %w0" : "=a"(ret) : "Nd"(port));

	return ret;
}

static ALWAYS_INLINE void sys_out32(uint32_t data, io_port_t port)
{
	__asm__ volatile("outl %0, %w1" :: "a"(data), "Nd"(port));
}

static ALWAYS_INLINE uint32_t sys_in32(io_port_t port)
{
	uint32_t ret;

	__asm__ volatile("inl %w1, %0" : "=a"(ret) : "Nd"(port));

	return ret;
}
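
/*
 * Usage sketch (hypothetical port number): the sys_out{8,16,32}() and
 * sys_in{8,16,32}() helpers wrap the x86 OUT/IN instructions for
 * port-mapped I/O, e.g. reading a legacy 8250 UART line status register:
 *
 *	uint8_t lsr = sys_in8(0x3f8 + 5);
 */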
static ALWAYS_INLINE void sys_write8(uint8_t data, mm_reg_t addr)
{
	__asm__ volatile("movb %0, %1"
			 :
			 : "q"(data), "m" (*(volatile uint8_t *)(uintptr_t) addr)
			 : "memory");
}

static ALWAYS_INLINE uint8_t sys_read8(mm_reg_t addr)
{
	uint8_t ret;

	__asm__ volatile("movb %1, %0"
			 : "=q"(ret)
			 : "m" (*(volatile uint8_t *)(uintptr_t) addr)
			 : "memory");

	return ret;
}

static ALWAYS_INLINE void sys_write16(uint16_t data, mm_reg_t addr)
{
	__asm__ volatile("movw %0, %1"
			 :
			 : "r"(data), "m" (*(volatile uint16_t *)(uintptr_t) addr)
			 : "memory");
}

static ALWAYS_INLINE uint16_t sys_read16(mm_reg_t addr)
{
	uint16_t ret;

	__asm__ volatile("movw %1, %0"
			 : "=r"(ret)
			 : "m" (*(volatile uint16_t *)(uintptr_t) addr)
			 : "memory");

	return ret;
}

static ALWAYS_INLINE void sys_write32(uint32_t data, mm_reg_t addr)
{
	__asm__ volatile("movl %0, %1"
			 :
			 : "r"(data), "m" (*(volatile uint32_t *)(uintptr_t) addr)
			 : "memory");
}

static ALWAYS_INLINE uint32_t sys_read32(mm_reg_t addr)
{
	uint32_t ret;

	__asm__ volatile("movl %1, %0"
			 : "=r"(ret)
			 : "m" (*(volatile uint32_t *)(uintptr_t) addr)
			 : "memory");

	return ret;
}
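
/*
 * Usage sketch (hypothetical register address): sys_read32() and
 * sys_write32() each compile to a single MOV of the stated width, which
 * is what memory-mapped device registers generally require:
 *
 *	uint32_t status = sys_read32(0xfec00000);
 *	sys_write32(status | 0x1U, 0xfec00000);
 */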
static ALWAYS_INLINE void sys_set_bit(mem_addr_t addr, unsigned int bit)
{
	__asm__ volatile("btsl %1, %0"
			 : "+m" (*(volatile uint32_t *) (addr))
			 : "Ir" (bit)
			 : "memory");
}

static ALWAYS_INLINE void sys_clear_bit(mem_addr_t addr, unsigned int bit)
{
	__asm__ volatile("btrl %1, %0"
			 : "+m" (*(volatile uint32_t *) (addr))
			 : "Ir" (bit));
}

static ALWAYS_INLINE int sys_test_bit(mem_addr_t addr, unsigned int bit)
{
	int ret;

	__asm__ volatile("btl %2, %1;"
			 "sbb %0, %0"
			 : "=r" (ret), "+m" (*(volatile uint32_t *) (addr))
			 : "Ir" (bit));

	return ret;
}

static ALWAYS_INLINE int sys_test_and_set_bit(mem_addr_t addr,
					      unsigned int bit)
{
	int ret;

	__asm__ volatile("btsl %2, %1;"
			 "sbb %0, %0"
			 : "=r" (ret), "+m" (*(volatile uint32_t *) (addr))
			 : "Ir" (bit));

	return ret;
}

static ALWAYS_INLINE int sys_test_and_clear_bit(mem_addr_t addr,
						unsigned int bit)
{
	int ret;

	__asm__ volatile("btrl %2, %1;"
			 "sbb %0, %0"
			 : "=r" (ret), "+m" (*(volatile uint32_t *) (addr))
			 : "Ir" (bit));

	return ret;
}
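
/*
 * Usage sketch (flags_addr is a hypothetical mem_addr_t): the
 * test-and-modify helpers perform the test and the update in a single
 * BTS/BTR instruction (note: no LOCK prefix, so not SMP-atomic); the
 * return value reports the bit's previous state:
 *
 *	if (sys_test_and_set_bit(flags_addr, 3) == 0) {
 *		... bit 3 was clear and has just been set ...
 *	}
 */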
#define sys_bitfield_set_bit		sys_set_bit
#define sys_bitfield_clear_bit		sys_clear_bit
#define sys_bitfield_test_bit		sys_test_bit
#define sys_bitfield_test_and_set_bit	sys_test_and_set_bit
#define sys_bitfield_test_and_clear_bit	sys_test_and_clear_bit

/*
 * Map of IRQ numbers to their assigned vectors. On IA32, this is generated
 * at build time and defined via the linker script. On Intel64, it's an array.
 */
extern unsigned char _irq_to_interrupt_vector[];

#define Z_IRQ_TO_INTERRUPT_VECTOR(irq) \
	((unsigned int) _irq_to_interrupt_vector[irq])
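
/*
 * Usage sketch: resolving the IDT vector assigned to an IRQ line, e.g.
 * for a hypothetical IRQ number:
 *
 *	unsigned int vector = Z_IRQ_TO_INTERRUPT_VECTOR(2);
 */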
#endif /* _ASMLANGUAGE */

#ifdef __cplusplus
}
#endif

#include <drivers/interrupt_controller/sysapic.h>

#ifdef CONFIG_X86_64
#include <arch/x86/intel64/arch.h>
#else
#include <arch/x86/ia32/arch.h>
#endif

#include <arch/common/ffs.h>

#ifdef __cplusplus
extern "C" {
#endif
#ifndef _ASMLANGUAGE

extern void arch_irq_enable(unsigned int irq);
extern void arch_irq_disable(unsigned int irq);

extern uint32_t z_timer_cycle_get_32(void);

static inline uint32_t arch_k_cycle_get_32(void)
{
	return z_timer_cycle_get_32();
}

static ALWAYS_INLINE bool arch_irq_unlocked(unsigned int key)
{
	return (key & 0x200) != 0;
}
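
/*
 * Usage sketch: given a key previously returned by arch_irq_lock()
 * (provided by the subarch headers above), this reports whether
 * interrupts were enabled when the lock was taken:
 *
 *	if (arch_irq_unlocked(key)) {
 *		... interrupts were enabled before the lock ...
 *	}
 */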
/**
 * @brief read timestamp register, 32-bits only, unserialized
 */
static ALWAYS_INLINE uint32_t z_do_read_cpu_timestamp32(void)
{
	uint32_t rv;

	/* RDTSC writes edx:eax; only the low half (eax) is kept here,
	 * so edx is declared clobbered.
	 */
	__asm__ volatile("rdtsc" : "=a" (rv) : : "%edx");

	return rv;
}
/**
 * @brief read timestamp register ensuring serialization
 */
static inline uint64_t z_tsc_read(void)
{
	union {
		struct {
			uint32_t lo;
			uint32_t hi;
		};
		uint64_t value;
	} rv;

#ifdef CONFIG_X86_64
	/*
	 * According to the Intel 64 and IA-32 Architectures Software
	 * Developer's Manual, volume 3, chapter 8.2.5, LFENCE provides
	 * a more efficient method of controlling memory ordering than
	 * the CPUID instruction. So use LFENCE here, as all 64-bit
	 * CPUs have LFENCE.
	 */
	__asm__ volatile ("lfence");
#else
	/* rdtsc & cpuid clobber the eax, ebx, ecx and edx registers */
	__asm__ volatile (/* serialize */
		"xorl %%eax,%%eax;"
		"cpuid"
		:
		:
		: "%eax", "%ebx", "%ecx", "%edx"
		);
#endif

#ifdef CONFIG_X86_64
	/*
	 * We cannot use "=A", since this would use %rax on x86_64 and
	 * return only the lower 32 bits of the TSC
	 */
	__asm__ volatile ("rdtsc" : "=a" (rv.lo), "=d" (rv.hi));
#else
	/* "=A" means that the value is in the eax:edx pair. */
	__asm__ volatile ("rdtsc" : "=A" (rv.value));
#endif

	return rv.value;
}
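
/*
 * Usage sketch: measuring elapsed cycles around a code region; the
 * serializing instruction issued before RDTSC keeps earlier
 * instructions from drifting past the timestamp read:
 *
 *	uint64_t start = z_tsc_read();
 *	... code under measurement ...
 *	uint64_t cycles = z_tsc_read() - start;
 */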
static ALWAYS_INLINE void arch_nop(void)
{
	__asm__ volatile("nop");
}

#endif /* _ASMLANGUAGE */

#ifdef __cplusplus
}
#endif

#endif /* ZEPHYR_INCLUDE_ARCH_X86_ARCH_H_ */