Implement a set of per-cpu trampoline stacks which all interrupts and exceptions will initially land on; they also serve as intermediate stacks for privilege changes, since we need some stack space to swap page tables.

Set up the special trampoline page which contains all the trampoline stacks, the TSS, and the GDT. This page needs to be present in the user page tables or interrupts don't work.

With KPTI turned on, CPU exceptions are treated as interrupts and not traps, so that we have IRQs locked on exception entry.

Add some additional macros for defining IDT entries.

Add special handling of the locore text/rodata sections when creating user mode page tables on x86-64.

Restore qemu_x86_64 to use KPTI, and remove restrictions on enabling user mode on x86-64.

Signed-off-by: Andrew Boie <andrew.p.boie@intel.com>
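For orientation, below is a minimal sketch of the entry-side KPTI pattern that the trampoline stacks exist to support: an interrupt or exception taken in user mode first lands on a trampoline stack that is mapped in the user page tables, the entry code switches CR3 to the kernel page tables, and only then does normal handling proceed on a regular kernel stack. This is an illustration of the technique, not the locore entry code from this patch; apart from z_x86_kernel_ptables, the labels are placeholders.

    /* Illustrative only: KPTI-style entry for a vector with no error code */
    kpti_entry_sketch:
            testb   $3, 8(%rsp)     /* saved CS: did we come from ring 3? */
            jz      1f              /* ring 0: kernel CR3 already loaded */
            pushq   %rax
            movq    $z_x86_kernel_ptables, %rax
            movq    %rax, %cr3      /* kernel-only mappings now visible */
            popq    %rax
    1:      /* ...continue to the real interrupt/exception handling path... */

The exit side of the same pattern is what z_x86_trampoline_to_user and the KPTI paths in the syscall stub below implement.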
/*
 * Copyright (c) 2017 Intel Corporation
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#include <arch/cpu.h>
#include <offsets_short.h>
#include <syscall.h>

#ifdef CONFIG_X86_KPTI
/* Copy the interrupt return stack context to the trampoline stack, switch
 * back to the user page table, and only then 'iretq'. We jump to this
 * instead of executing 'iretq' directly if KPTI is turned on. This must be
 * invoked with interrupts locked.
 *
 * Stack layout is expected to be what 'iretq' expects, which is as follows:
 *
 * 32 SS
 * 24 RSP
 * 16 RFLAGS
 *  8 CS
 *  0 RIP
 */
.global z_x86_trampoline_to_user
z_x86_trampoline_to_user:
	/* Stash RDI, we need a free register */
	pushq	%rdi

	/* Store old stack pointer and switch to trampoline stack */
	movq	%rsp, %rdi
	movq	%gs:__x86_tss64_t_ist2_OFFSET, %rsp

	/* Copy context */
	pushq	40(%rdi)	/* SS */
	pushq	32(%rdi)	/* RSP */
	pushq	24(%rdi)	/* RFLAGS */
	pushq	16(%rdi)	/* CS */
	pushq	8(%rdi)		/* RIP */
	xchgq	%rdi, (%rdi)	/* Exchange old RDI to restore it, leaving the
				 * old stack pointer in its storage slot instead
				 */

	/* Switch to thread's page table */
	pushq	%rax
	movq	%gs:__x86_tss64_t_cpu_OFFSET, %rax
	movq	___cpu_t_current_OFFSET(%rax), %rax
	movq	_thread_offset_to_ptables(%rax), %rax
	movq	%rax, %cr3
	popq	%rax
	movq	$0, -8(%rsp)	/* Delete stashed RAX data */

	/* Trampoline stack should have nothing sensitive in it at this point */
	swapgs
	iretq
#endif /* CONFIG_X86_KPTI */

/* Landing site for the 'syscall' instruction
 *
 * Call ID is in RAX
 * Arguments are in RDI, RSI, RDX, R10, R8, R9
 * Return address stored by the CPU in RCX
 * User RFLAGS stored by the CPU in R11
 * Current RFLAGS has been masked with ~X86_FMASK_MSR
 */
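/* For reference, a minimal sketch (not the actual Zephyr CPU init code,
 * which programs these MSRs elsewhere) of the architectural setup that
 * makes 'syscall' land here: IA32_LSTAR (0xC0000082) holds the 64-bit
 * entry RIP, IA32_STAR (0xC0000081) holds the CS/SS selector bases, and
 * IA32_FMASK (0xC0000084) holds the RFLAGS bits to clear on entry; the
 * exact mask value below is an assumption.
 *
 *	movl	$0xC0000082, %ecx		IA32_LSTAR
 *	movq	$z_x86_syscall_entry_stub, %rax
 *	movq	%rax, %rdx
 *	shrq	$32, %rdx			wrmsr takes EDX:EAX
 *	wrmsr
 *
 *	movl	$0xC0000084, %ecx		IA32_FMASK
 *	movl	$0x200, %eax			at least mask IF so we enter
 *	xorl	%edx, %edx			with IRQs locked
 *	wrmsr
 */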
.global z_x86_syscall_entry_stub
z_x86_syscall_entry_stub:
	swapgs

	/* Save the original user-mode stack pointer in memory; at the
	 * moment we have no free registers or stack to save it to. It
	 * gets moved onto the privilege mode stack before we re-enable
	 * interrupts, as this save area is per-cpu and not per-thread.
	 */
	movq	%rsp, %gs:__x86_tss64_t_usp_OFFSET

#ifdef CONFIG_X86_KPTI
	/* We need to switch to the trampoline stack so that we can
	 * switch to the kernel's page table
	 */
	movq	%gs:__x86_tss64_t_ist2_OFFSET, %rsp

	/* Load kernel's page table */
	pushq	%rax
	movq	$z_x86_kernel_ptables, %rax
	movq	%rax, %cr3
	popq	%rax
	movq	$0, -8(%rsp)	/* Delete stashed RAX data */
#endif /* CONFIG_X86_KPTI */

	/* Switch to the privilege mode stack pointer stored in
	 * x86_tss64.psp
	 */
	movq	%gs:__x86_tss64_t_psp_OFFSET, %rsp

	/* We're now on the privilege mode stack; push the old user stack
	 * pointer onto it
	 */
	pushq	%gs:__x86_tss64_t_usp_OFFSET
#ifdef CONFIG_X86_KPTI
	movq	$0, %gs:__x86_tss64_t_usp_OFFSET
#endif

	sti			/* re-enable interrupts */

	/* The call ID is in RAX; bounds-check it, it must be less than
	 * K_SYSCALL_LIMIT.
	 */
	cmp	$K_SYSCALL_LIMIT, %rax
	jae	_bad_syscall

_id_ok:
#ifdef CONFIG_X86_BOUNDS_CHECK_BYPASS_MITIGATION
	/* Prevent speculation with bogus system call IDs */
	lfence
#endif

	/* The remaining registers not involved in the syscall operation are
	 * RBX, RBP, R12-R15, plus the floating point / SIMD registers.
	 *
	 * We save the caller-saved registers so that we can restore them to
	 * their original values when we 'sysretq' at the end.
	 */
	pushq	%rdi
	subq	$X86_FXSAVE_SIZE, %rsp
	fxsave	(%rsp)
	pushq	%rsi
	pushq	%rdx
	pushq	%r8
	pushq	%r9
	pushq	%r10
	pushq	%r11	/* RFLAGS */
	pushq	%rcx	/* Return address stored by 'syscall' */
	pushq	%rsp	/* SSF parameter */

	/* All the other arguments are already in the right registers, except
	 * arg4, which we had to pass in R10 instead of RCX
	 */
	movq	%r10, %rcx

	/* From the call ID in RAX, load R10 with the actual function pointer
	 * to call by looking it up in the system call dispatch table
	 */
	xorq	%r11, %r11
	movq	_k_syscall_table(%r11, %rax, 8), %r10

	/* Run the marshal function, which is some entry in _k_syscall_table */
	call	*%r10

	/* RAX now contains the return value
	 *
	 * Callee-saved registers are untouched from their original values
	 * per the C calling convention, but sensitive data may lurk in the
	 * caller-saved regs RDI, RSI, RDX, R8, R9, R10, XMM* after we have
	 * serviced the system call. We saved them earlier; restore them to
	 * the values they had when the syscall was made. This also preserves
	 * these registers if they were not used as arguments.
	 *
	 * We also can't have RCX and R11 clobbered, as we need their original
	 * values to successfully 'sysretq'.
	 */
	addq	$8, %rsp	/* Discard ssf */
	popq	%rcx		/* Restore return address for 'sysretq' */
	popq	%r11		/* Restore RFLAGS for 'sysretq' */
	popq	%r10
	popq	%r9
	popq	%r8
	popq	%rdx
	popq	%rsi
	fxrstor	(%rsp)
	addq	$X86_FXSAVE_SIZE, %rsp
	popq	%rdi

#ifdef CONFIG_X86_KPTI
	/* Lock IRQs as we are using per-cpu memory areas and the
	 * trampoline stack
	 */
	cli

	/* Stash user stack pointer and switch to trampoline stack */
	popq	%gs:__x86_tss64_t_usp_OFFSET
	movq	%gs:__x86_tss64_t_ist2_OFFSET, %rsp

	/* Switch to thread's page table */
	pushq	%rax
	movq	%gs:__x86_tss64_t_cpu_OFFSET, %rax
	movq	___cpu_t_current_OFFSET(%rax), %rax
	movq	_thread_offset_to_ptables(%rax), %rax
	movq	%rax, %cr3
	popq	%rax
	movq	$0, -8(%rsp)	/* Delete stashed RAX data */

	/* Restore saved user stack pointer */
	movq	%gs:__x86_tss64_t_usp_OFFSET, %rsp
	movq	$0, %gs:__x86_tss64_t_usp_OFFSET
#else
	/* Restore user stack pointer */
	popq	%rsp

	/* Return to user mode, locking interrupts, as the normal interrupt
	 * handling path will get very confused if an interrupt occurs
	 * between 'swapgs' and 'sysretq'
	 */
	cli
#endif /* CONFIG_X86_KPTI */

	swapgs
	sysretq

_bad_syscall:
	/* RAX had a bogus syscall value in it; replace it with the bad
	 * syscall handler's ID, and pass the bad ID as its first argument.
	 *
	 * TODO: On this and all other arches, simply return immediately
	 * with -ENOSYS, once all syscalls have a return value
	 */
	movq	%rax, %rdi
	movq	$K_SYSCALL_BAD, %rax
	jmp	_id_ok

/*
 * size_t arch_user_string_nlen(const char *s, size_t maxsize, int *err_arg)
 *                                          ^ RDI         ^ RSI      ^ RDX
 */
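/* The fault_start/fault_end/fixup labels below form an exception-fixup
 * contract; the description here is a sketch of the assumed behavior, not
 * a statement about the fault handler's implementation: if a page fault
 * occurs with RIP inside
 * [z_x86_user_string_nlen_fault_start, z_x86_user_string_nlen_fault_end),
 * the handler is expected to resume execution at
 * z_x86_user_string_nlen_fixup, so the -1 still held in R8D gets written
 * through err_arg and the thread is not killed.
 */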
.global arch_user_string_nlen
arch_user_string_nlen:
	/* Initial error value, strlen_done adjusts this if we succeed */
	movl	$-1, %r8d

	/* Use RAX as our length count (this function's return value) */
	xor	%rax, %rax

	/* This code might page fault */
strlen_loop:
.global z_x86_user_string_nlen_fault_start
z_x86_user_string_nlen_fault_start:
	cmpb	$0x0, (%rdi, %rax, 1)	/* *(RDI + RAX) == 0? Could fault. */

.global z_x86_user_string_nlen_fault_end
z_x86_user_string_nlen_fault_end:
	je	strlen_done
	cmp	%rsi, %rax		/* Max length reached? */
	je	strlen_done
	inc	%rax			/* RAX++ and loop again */
	jmp	strlen_loop

strlen_done:
	/* Set the error value to 0 since we succeeded */
	xorl	%r8d, %r8d

.global z_x86_user_string_nlen_fixup
z_x86_user_string_nlen_fixup:
	/* Write the error value to the 32-bit integer err pointer parameter */
	movl	%r8d, (%rdx)
	retq

/*
 * Trampoline function to put the p3 parameter in the register expected by
 * the calling convention; we couldn't use RCX for it across the 'sysretq'
 */
z_x86_userspace_landing_site:
	/* Place argument 4 in the correct position */
	movq	%r10, %rcx
	call	z_thread_entry

/* FUNC_NORETURN void z_x86_userspace_enter(
 *		k_thread_entry_t user_entry,	<- RDI
 *		void *p1, void *p2, void *p3,	<- RSI, RDX, RCX
 *		uintptr_t stack_end,		<- R8
 *		uintptr_t stack_start)		<- R9
 *
 * A one-way trip to userspace.
 */
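/* For context (architectural 'sysretq' behavior, not anything specific to
 * this port): 'sysretq' returns to ring 3 by loading RIP from RCX and
 * RFLAGS from R11. That is why the user entry point must end up in RCX,
 * the flags image in R11, and why p3 takes a detour through R10 until
 * z_x86_userspace_landing_site moves it back into RCX.
 */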
.global z_x86_userspace_enter
z_x86_userspace_enter:
	/* RCX is the sysret return address; pass p3 along in R10,
	 * z_x86_userspace_landing_site will fix this up
	 */
	movq	%rcx, %r10

	/* Switch to the privilege mode stack so we can erase the thread
	 * stack buffer; the privilege stack is the page immediately before
	 * the thread stack
	 */
	movq	%r9, %rsp

	/* Need RDI temporarily */
	pushq	%rdi

	/* Compute the size of the user stack in 8-byte chunks and put it
	 * in RCX
	 */
	movq	%r9, %rdi	/* Start address for rep stosq in RDI */
	movq	%r8, %rcx	/* Ending address */
	subq	%rdi, %rcx	/* Subtract starting address */
	shrq	$3, %rcx	/* Divide by 8 */

	movq	$0xAAAAAAAAAAAAAAAA, %rax	/* Fill value */
	/* Copy 8 bytes of memory at a time, starting at ES:RDI, with
	 * whatever is in RAX. Repeat this RCX times. Stack sizes are
	 * always at least 8-byte aligned.
	 */
	cld
	rep stosq
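	/* Worked example with an assumed (not Zephyr-default) size: a 4 KiB
	 * user stack gives RCX = 0x1000 >> 3 = 512, so 'rep stosq' performs
	 * 512 eight-byte stores of the 0xAA.. pattern, covering
	 * [stack_start, stack_end).
	 */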

	popq	%rdi

	/* Reset to the beginning of the user stack */
	movq	%r8, %rsp

	/* set sysret entry point */
	movq	$z_x86_userspace_landing_site, %rcx

	/* Copy RFLAGS into r11, required by sysret */
	pushfq
	movq	(%rsp), %r11
	movq	$0, (%rsp)	/* Now a debugger-friendly return address */

	/* cleanse other registers */
	xorq	%rbx, %rbx
	xorq	%rbp, %rbp
	xorq	%r12, %r12
	xorq	%r13, %r13
	xorq	%r14, %r14
	xorq	%r15, %r15

	cli
#ifdef CONFIG_X86_KPTI
	/* Switch to thread's page table. We have free registers so no need
	 * to involve the trampoline stack.
	 */
	movq	%gs:__x86_tss64_t_cpu_OFFSET, %rax
	movq	___cpu_t_current_OFFSET(%rax), %rax
	movq	_thread_offset_to_ptables(%rax), %rax
	movq	%rax, %cr3
#endif
	swapgs
	sysretq