[PATCH 5/5] x86_64: add 64-bit syscall entry point
From: Luca Dariz
Subject: [PATCH 5/5] x86_64: add 64-bit syscall entry point
Date: Mon, 27 Feb 2023 21:45:01 +0100
While in theory we could still use the same call gate as for 32-bit
userspace, that approach does not seem very common, and gcc does not
seem to encode the instruction properly. Instead, use syscall/sysret
as other kernels do (e.g. XNU, Linux). This version still has some
limitations, but it should be enough to start working on the 64-bit
user space.
* i386/i386/i386asm.sym: add more constants to fill pcb->iss
* i386/i386/ldt.c: configure 64-bit syscall entry point
* i386/i386/ldt.h: swap CS/DS segments order if !USER32 as required by
sysret
* i386/i386/locore.h: add syscall64 and MSR definitions
* i386/include/mach/i386/syscall_sw.h: add a simple entry point from
  user space. This is just for simple tests; glibc does not seem to
  use it
* x86_64/locore.S: implement syscall64 entry point
---
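For reference, once EFER.SCE is set and STAR/LSTAR are programmed as in
ldt.c below, the hardware side of the transition behaves roughly as in
this minimal C model (illustration only; the struct and variable names
are made up, the semantics follow the AMD64 manuals):

    #include <stdint.h>

    static uint64_t star, lstar, fmask;    /* MSR values programmed at boot */

    struct cpu_state { uint64_t rip, rflags, rcx, r11; uint16_t cs, ss; };

    /* What the CPU does on `syscall' in long mode. */
    static void model_syscall(struct cpu_state *c)
    {
        c->rcx    = c->rip;                 /* return RIP saved in RCX */
        c->r11    = c->rflags;              /* user RFLAGS saved in R11 */
        c->rflags &= ~fmask;                /* FMASK clears selected flags */
        c->cs     = (star >> 32) & 0xffff;  /* KERNEL_CS */
        c->ss     = c->cs + 8;              /* kernel SS follows the CS */
        c->rip    = lstar;                  /* jump to syscall64 */
    }

    /* What the CPU does on `sysretq'. */
    static void model_sysretq(struct cpu_state *c)
    {
        c->rip    = c->rcx;                 /* RCX holds the return RIP */
        c->rflags = c->r11;                 /* R11 holds the user RFLAGS */
        c->cs     = ((star >> 48) + 16) | 3;  /* 64-bit user CS, RPL 3 */
        c->ss     = ((star >> 48) + 8) | 3;   /* user SS, RPL 3 */
    }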
i386/i386/i386asm.sym | 11 +++
i386/i386/ldt.c | 15 ++-
i386/i386/ldt.h | 7 +-
i386/i386/locore.h | 29 ++++++
i386/include/mach/i386/syscall_sw.h | 16 ++--
x86_64/locore.S | 136 ++++++++++++++++++++++++++++
6 files changed, 204 insertions(+), 10 deletions(-)
diff --git a/i386/i386/i386asm.sym b/i386/i386/i386asm.sym
index 8317db6c..733cc4eb 100644
--- a/i386/i386/i386asm.sym
+++ b/i386/i386/i386asm.sym
@@ -52,6 +52,8 @@ expr CALL_SINGLE_FUNCTION_BASE
offset ApicLocalUnit lu apic_id APIC_ID
+offset pcb pcb iss
+
offset thread th pcb
offset thread th task
offset thread th recover
@@ -82,9 +84,15 @@ size i386_kernel_state iks
size i386_exception_link iel
+offset i386_saved_state r gs
+offset i386_saved_state r fs
offset i386_saved_state r cs
offset i386_saved_state r uesp
offset i386_saved_state r eax
+offset i386_saved_state r ebx
+offset i386_saved_state r ecx
+offset i386_saved_state r edx
+offset i386_saved_state r ebp
offset i386_saved_state r trapno
offset i386_saved_state r err
offset i386_saved_state r efl R_EFLAGS
@@ -92,6 +100,9 @@ offset i386_saved_state r eip
offset i386_saved_state r cr2
offset i386_saved_state r edi
#ifdef __x86_64__
+offset i386_saved_state r r12
+offset i386_saved_state r r13
+offset i386_saved_state r r14
offset i386_saved_state r r15
#endif
diff --git a/i386/i386/ldt.c b/i386/i386/ldt.c
index b86a0e3c..61a03d65 100644
--- a/i386/i386/ldt.c
+++ b/i386/i386/ldt.c
@@ -31,6 +31,7 @@
#include <mach/xen.h>
#include <intel/pmap.h>
+#include <kern/debug.h>
#include "vm_param.h"
#include "seg.h"
@@ -65,10 +66,22 @@ ldt_fill(struct real_descriptor *myldt, struct real_descriptor *mygdt)
ACC_PL_K|ACC_LDT, 0);
#endif /* MACH_PV_DESCRIPTORS */
- /* Initialize the 32bit LDT descriptors. */
+ /* Initialize the syscall entry point */
+#if defined(__x86_64__) && ! defined(USER32)
+ if (!(CPU_HAS_FEATURE(CPU_FEATURE_MSR) && CPU_HAS_FEATURE(CPU_FEATURE_SEP)))
+ panic("syscall support is missing on 64 bit");
+ /* Enable 64-bit syscalls */
+ wrmsr(MSR_REG_EFER, rdmsr(MSR_REG_EFER) | MSR_EFER_SCE);
+ wrmsr(MSR_REG_LSTAR, syscall64);
+ wrmsr(MSR_REG_STAR, ((((long)USER_CS - 16) << 16) | (long)KERNEL_CS) << 32);
+ wrmsr(MSR_REG_FMASK, 0); // ?
+#else /* defined(__x86_64__) && ! defined(USER32) */
fill_ldt_gate(myldt, USER_SCALL,
(vm_offset_t)&syscall, KERNEL_CS,
ACC_PL_U|ACC_CALL_GATE, 0);
+#endif /* defined(__x86_64__) && ! defined(USER32) */
+
+ /* Initialize the 32bit LDT descriptors. */
fill_ldt_descriptor(myldt, USER_CS,
VM_MIN_USER_ADDRESS,
VM_MAX_USER_ADDRESS-VM_MIN_USER_ADDRESS-4096,
diff --git a/i386/i386/ldt.h b/i386/i386/ldt.h
index b15f11a5..4490f99f 100644
--- a/i386/i386/ldt.h
+++ b/i386/i386/ldt.h
@@ -45,9 +45,14 @@
#define USER_SCALL 0x07 /* system call gate */
#ifdef __x86_64__
/* Call gate needs two entries */
-#endif
+
+/* The sysret instruction puts some constraints on the user segment indexes */
+#define USER_CS 0x1f /* user code segment */
+#define USER_DS 0x17 /* user data segment */
+#else
#define USER_CS 0x17 /* user code segment */
#define USER_DS 0x1f /* user data segment */
+#endif
#define LDTSZ 4
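The CS/DS swap above is forced by how sysretq rebuilds the selectors: it
loads CS from STAR[63:48] + 16 and SS from STAR[63:48] + 8, and ldt.c
programs STAR[63:48] with USER_CS - 16. A throwaway check of the
arithmetic (not part of the patch; the constants already carry the
TI=1/RPL=3 low bits that sysretq ORs in):

    #include <assert.h>

    #define USER_CS 0x1f  /* user code segment (the !USER32 layout above) */
    #define USER_DS 0x17  /* user data segment */

    int main(void)
    {
        unsigned base = USER_CS - 16;   /* 0x0f, written to STAR[63:48] */
        assert(base + 16 == USER_CS);   /* CS reloaded by sysretq */
        assert(base + 8  == USER_DS);   /* SS reloaded by sysretq */
        return 0;
    }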
diff --git a/i386/i386/locore.h b/i386/i386/locore.h
index a8807dbf..39545ff5 100644
--- a/i386/i386/locore.h
+++ b/i386/i386/locore.h
@@ -57,6 +57,7 @@ extern int inst_fetch (int eip, int cs);
extern void cpu_shutdown (void);
extern int syscall (void);
+extern int syscall64 (void);
extern unsigned int cpu_features[2];
@@ -93,5 +94,33 @@ extern unsigned int cpu_features[2];
#define CPU_HAS_FEATURE(feature) (cpu_features[(feature) / 32] & (1 << ((feature) % 32)))
+#define MSR_REG_EFER 0xC0000080
+#define MSR_REG_STAR 0xC0000081
+#define MSR_REG_LSTAR 0xC0000082
+#define MSR_REG_CSTAR 0xC0000083
+#define MSR_REG_FMASK 0xC0000084
+
+#define MSR_EFER_SCE 0x00000001
+
+static inline void wrmsr(uint32_t regaddr, uint64_t value)
+{
+ uint32_t low=(uint32_t)value, high=((uint32_t)(value >> 32));
+ asm volatile("wrmsr\n" \
+ : \
+ : "c" (regaddr), "a" (low), "d" (high) \
+ : "memory" \
+ );
+}
+
+static inline uint64_t rdmsr(uint32_t regaddr)
+{
+ uint32_t low, high;
+ asm volatile("rdmsr\n" \
+ : "=a" (low), "=d" (high) \
+ : "c" (regaddr) \
+ );
+ return ((uint64_t)high << 32) | low;
+}
+
#endif /* _MACHINE__LOCORE_H_ */
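A short usage note for the new helpers, mirroring what ldt_fill() does in
ldt.c above (the read-modify-write keeps any other EFER enable bits
intact; the casts are shown for clarity):

    /* Enable SYSCALL/SYSRET without disturbing other EFER bits. */
    wrmsr(MSR_REG_EFER, rdmsr(MSR_REG_EFER) | MSR_EFER_SCE);
    /* Register the 64-bit entry point. */
    wrmsr(MSR_REG_LSTAR, (uint64_t)(uintptr_t)syscall64);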
diff --git a/i386/include/mach/i386/syscall_sw.h b/i386/include/mach/i386/syscall_sw.h
index 86f6ff2f..20ef7c13 100644
--- a/i386/include/mach/i386/syscall_sw.h
+++ b/i386/include/mach/i386/syscall_sw.h
@@ -29,16 +29,16 @@
#include <mach/machine/asm.h>
-#if BSD_TRAP
-#define kernel_trap(trap_name,trap_number,number_args) \
-ENTRY(trap_name) \
- movl $ trap_number,%eax; \
- SVC; \
- jb LCL(cerror); \
- ret; \
+#if defined(__x86_64__) && ! defined(USER32)
+#define kernel_trap(trap_name,trap_number,number_args) \
+ENTRY(trap_name) \
+ movq $ trap_number,%rax; \
+ movq %rcx,%r10; \
+ syscall; \
+ ret; \
END(trap_name)
#else
-#define kernel_trap(trap_name,trap_number,number_args) \
+#define kernel_trap(trap_name,trap_number,number_args) \
ENTRY(trap_name) \
movl $ trap_number,%eax; \
SVC; \
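From C, the new stub amounts to inline assembly like the sketch below (a
hypothetical wrapper, not from the patch; real trap numbers come from the
generated mach/syscall_sw.h). The point is that `syscall' itself clobbers
RCX and R11, so the fourth C argument travels in R10, and since this
kernel clears the argument registers before sysret they are listed as
in-outs or clobbers:

    /* Illustrative 4-argument Mach trap under the !USER32 convention. */
    static inline long mach_trap4(long nr, long a1, long a2, long a3, long a4)
    {
        long ret = nr;                       /* trap number goes in RAX */
        register long r10 asm("r10") = a4;   /* RCX is clobbered by syscall */
        asm volatile("syscall"
                     : "+a" (ret), "+D" (a1), "+S" (a2), "+d" (a3), "+r" (r10)
                     :
                     : "rcx", "r11", "r8", "r9", "memory");
        return ret;                          /* result comes back in RAX */
    }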
diff --git a/x86_64/locore.S b/x86_64/locore.S
index 47d9085c..fdf7300b 100644
--- a/x86_64/locore.S
+++ b/x86_64/locore.S
@@ -1281,6 +1281,142 @@ DATA(cpu_features_ecx)
END(syscall)
+
+/* Entry point for 64-bit syscalls.
+ * On entry we're still on the user stack, so better not use it. Instead we
+ * save the thread state immediately in thread->pcb->iss, then try to invoke
+ * the syscall.
+ * TODO:
+   - for now we assume the return address is canonical, but apparently there
+     can be cases where it's not (see how Linux handles this). Does it apply
+     here?
+   - do we need to check for AST on syscalls? Maybe checking on interrupts
+     is enough
+   - check that the case where a task is suspended, and later returns via
+     iretq from return_from_trap, works fine in all combinations
+   - emulated syscalls - are they used anywhere?
+ */
+ENTRY(syscall64)
+ /* RFLAGS[32:63] are reserved, so combine the syscall number (32 bits)
+ * and eflags in RAX to allow using r11 as a temporary register */
+ shlq $32,%r11
+ shlq $32,%rax /* make sure bits 32:63 of %rax are zero */
+ shrq $32,%rax
+ or %r11,%rax
+
+ /* Save thread state in pcb->iss, as on exception entry.
+ * Since this is triggered synchronously from userspace, we can
+ * save only the callee-preserved state according to the C ABI,
+ * plus RIP and EFLAGS for sysret */
+ CPU_NUMBER(%r11)
+ movq CX(EXT(active_threads),%r11),%r11 /* point to current thread */
+ movq TH_PCB(%r11),%r11 /* point to pcb */
+ addq $ PCB_ISS,%r11 /* point to saved state */
+
+ mov %gs,R_GS(%r11)
+ mov %fs,R_FS(%r11)
+ mov %rsp,R_UESP(%r11) /* callee-preserved register */
+ mov %rcx,R_EIP(%r11) /* syscall places user RIP in RCX */
+ mov %rbx,R_EBX(%r11) /* callee-preserved register */
+ mov %rax,%rbx /* Now we can unpack eflags again */
+ shr $32,%rbx
+ mov %rbx,R_EFLAGS(%r11) /* ... and save them in pcb as well */
+ mov %rbp,R_EBP(%r11) /* callee-preserved register */
+ mov %r12,R_R12(%r11) /* callee-preserved register */
+ mov %r13,R_R13(%r11) /* callee-preserved register */
+ mov %r14,R_R14(%r11) /* callee-preserved register */
+ mov %r15,R_R15(%r11) /* callee-preserved register */
+ mov %r11,%rbx /* prepare for error handling */
+ mov %r10,%rcx /* fix arg3 location according to C ABI */
+
+ /* switch to kernel stack */
+ CPU_NUMBER(%r11)
+ movq CX(EXT(kernel_stack),%r11),%rsp
+
+ /* Now we have saved state and args 1-6 are in place.
+ * Before invoking the syscall we do some bounds checking and,
+ * if we have more than 6 arguments, we need to copy the
+ * remaining ones to the kernel stack, handling page faults when
+ * accessing the user stack.
+ */
+ shlq $32,%rax /* make sure bits 32:63 of %rax are zero */
+ shrq $32,%rax
+ negl %eax /* get system call number */
+ jl _syscall64_range /* out of range if it was positive */
+ cmpl EXT(mach_trap_count),%eax /* check system call table bounds */
+ jg _syscall64_range /* error if out of range */
+ shll $5,%eax /* manual indexing of mach_trap_t */
+
+ /* check if we need to place some arguments on the stack */
+_syscall64_args_stack:
+ mov EXT(mach_trap_table)(%rax),%r10 /* get number of arguments */
+ subq $6,%r10 /* the first 6 args are already in place */
+ jle _syscall64_call /* skip argument copy if at most 6 args */
+
+ movq R_UESP(%rbx),%r11 /* get user stack pointer */
+ addq $8,%r11 /* Skip user return address */
+
+ mov $USER_DS,%r12 /* use user data segment for accesses */
+ mov %r12,%fs
+
+ lea (%r11,%r10,8),%r11 /* point past last argument */
+ xorq %r12,%r12
+
+0: subq $8,%r11
+ RECOVER(_syscall64_addr_push)
+ mov %fs:(%r11),%r12
+ pushq %r12 /* push argument on stack */
+ dec %r10
+ jnz 0b /* loop for all remaining arguments */
+
+_syscall64_call:
+ call *EXT(mach_trap_table)+8(%rax) /* call procedure */
+ // XXX: check ast on exit?
+
+ /* avoid leaking information in callee-clobbered registers */
+ mov $0,%rdi
+ mov $0,%rsi
+ mov $0,%rdx
+ mov $0,%r10
+ mov $0,%r9
+ mov $0,%r8
+
+ /* restore thread state and return to user using sysret */
+ CPU_NUMBER(%r11)
+ movq CX(EXT(active_threads),%r11),%r11 /* point to current thread */
+ movq TH_PCB(%r11),%r11 /* point to pcb */
+ addq $ PCB_ISS,%r11 /* point to saved state */
+
+ mov R_GS(%r11),%gs
+ mov R_FS(%r11),%fs
+ mov R_UESP(%r11),%rsp /* callee-preserved register,
+ * switch to user stack */
+ mov R_EIP(%r11),%rcx /* sysret convention */
+ mov R_EBX(%r11),%rbx /* callee-preserved register */
+ mov R_EBP(%r11),%rbp /* callee-preserved register */
+ mov R_R12(%r11),%r12 /* callee-preserved register */
+ mov R_R13(%r11),%r13 /* callee-preserved register */
+ mov R_R14(%r11),%r14 /* callee-preserved register */
+ mov R_R15(%r11),%r15 /* callee-preserved register */
+ mov R_EFLAGS(%r11),%r11 /* sysret convention */
+
+ sysretq /* fast return to user-space, the thread didn't block */
+
+/* Error-handling fragments; from here we jump directly to the trap handler */
+_syscall64_addr_push:
+ movq %rbx,%rsp /* clean parameters from stack */
+ movq %r11,R_CR2(%rbx) /* set fault address */
+ movq $(T_PAGE_FAULT),R_TRAPNO(%rbx) /* set page-fault trap */
+ movq $(T_PF_USER),R_ERR(%rbx) /* set error code - read user space */
+ jmp _take_trap /* treat as a trap */
+
+_syscall64_range:
+ movq $(T_INVALID_OPCODE),R_TRAPNO(%rbx)
+ /* set invalid-operation trap */
+ movq $0,R_ERR(%rbx) /* clear error code */
+ jmp _take_trap /* treat as a trap */
+
+END(syscall64)
+
/* Discover what kind of cpu we have; return the family number
(3, 4, 5, 6, for 386, 486, 586, 686 respectively). */
ENTRY(discover_x86_cpu_type)
--
2.30.2
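
One more note on the dispatch in syscall64: the `shll $5' and the `+8'
used when calling through mach_trap_table assume a 32-byte mach_trap_t
with the handler pointer at offset 8. Roughly, in C (the entry layout,
the syscall_fn_t type and the helper names here are assumptions read off
the assembly, not the real declarations):

    /* Illustrative C equivalent of the bounds check and manual indexing. */
    int nr = -(int)eax;                    /* Mach trap numbers are negative */
    if (nr < 0 || nr > mach_trap_count)
        return raise_invalid_opcode();     /* the _syscall64_range path */
    const char *entry = (const char *)mach_trap_table + ((unsigned)nr << 5);
    ret = (*(syscall_fn_t *)(entry + 8))(a1, a2, a3, a4, a5, a6);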