I have only skimmed the diffs. Your knowledge of the deep semantics, gained by close differential reading of the Intel and AMD docs, is truly amazing. Many thanks for pushing this through!
Thanks for bringing this to our attention too; apart from fixing the practical bug, hopefully it will help future readers to have a more precise implementation.
I tried to acknowledge your contribution in the commit messages.
I have two nits, perhaps stylistic only.
For code like "sp -= 2" or "sp += 2" that is followed or preceded by a write of a uint16_t variable 'x' to the stack, would it be better/more robust to rewrite it as "sp -= sizeof(x)"?
I think that's intentional: the value subtracted relates to the "stw" or "stl" in the store (likewise for the increment after a load) rather than to the size of x.
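Concretely, the pattern is something like this (a toy sketch, not the actual QEMU helpers):

    #include <stdint.h>
    #include <string.h>

    static uint8_t stack[64];

    /* Illustration only: the decrement pairs with the width of the
     * store (stw = 2 bytes, stl = 4 bytes), not with sizeof(x). */
    static void stw(uint32_t addr, uint16_t v)
    {
        memcpy(&stack[addr], &v, 2);    /* 16-bit store */
    }

    static void pushw(uint32_t *sp, uint32_t x) /* x may be wider than 16 bits */
    {
        *sp -= 2;       /* 2 == width of stw, not sizeof(x) */
        stw(*sp, x);    /* truncating 16-bit store */
    }

So "sp -= sizeof(x)" would silently change the width of the push if x were ever widened, while the constant keeps it tied to the store.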
There are a lot of masks constructed using -1. I think it would be clearer to use 0xffffffff (for 32-bit masks), as that reminds the reader that this is a bit mask. But using -1 seems to be how the original code was written.
-1 is used for 64-bit masks only. They get unwieldy quickly. :)
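A toy comparison of the two spellings:

    #include <stdint.h>

    static uint64_t masks(uint64_t v)
    {
        uint32_t lo  = v & 0xffffffff;  /* 32-bit: hex reads as a mask at a glance */
        uint64_t all = v & -1ull;       /* 64-bit: -1 vs. 0xffffffffffffffffull */
        return lo + all;
    }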
Paolo
This series includes bugfixes:
- allowing IRET from user mode to user mode with SMAP (do not use implicit
kernel accesses, which break if the stack is in userspace)
- use DPL-level accesses for interrupts and call gates
- various fixes for task switching
And two related cleanups: computing the MMU index once for far calls and returns
(including task switches), and using X86Access for TSS access.
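The X86Access pattern looks roughly like this (identifiers from the series; treat the exact signatures as an approximation):

    /* Sketch only: one access_prepare_mmu() call checks and translates
     * the whole TSS range up front, so the loads that follow cannot
     * fault halfway through the task switch. */
    X86Access access;
    access_prepare_mmu(&access, env, tss_base, tss_limit,
                       MMU_DATA_LOAD, mmu_index, retaddr);
    uint32_t new_eip    = access_ldl(&access, tss_base + 0x20); /* TSS.EIP */
    uint32_t new_eflags = access_ldl(&access, tss_base + 0x24); /* TSS.EFLAGS */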
Tested with a really ugly patch to kvm-unit-tests, included after the signature.
Paolo Bonzini (7):
target/i386/tcg: Allow IRET from user mode to user mode with SMAP
target/i386/tcg: use PUSHL/PUSHW for error code
target/i386/tcg: Compute MMU index once
target/i386/tcg: Use DPL-level accesses for interrupts and call gates
target/i386/tcg: check for correct busy state before switching to a
new task
target/i386/tcg: use X86Access for TSS access
target/i386/tcg: save current task state before loading new one
Richard Henderson (3):
target/i386/tcg: Remove SEG_ADDL
target/i386/tcg: Reorg push/pop within seg_helper.c
target/i386/tcg: Introduce x86_mmu_index_{kernel_,}pl
target/i386/cpu.h | 11 +-
target/i386/cpu.c | 27 +-
target/i386/tcg/seg_helper.c | 606 +++++++++++++++++++----------------
3 files changed, 354 insertions(+), 290 deletions(-)
--
2.45.2
diff --git a/lib/x86/usermode.c b/lib/x86/usermode.c
index c3ec0ad7..0bf40c6d 100644
--- a/lib/x86/usermode.c
+++ b/lib/x86/usermode.c
@@ -5,13 +5,15 @@
#include "x86/desc.h"
#include "x86/isr.h"
#include "alloc.h"
+#include "alloc_page.h"
#include "setjmp.h"
#include "usermode.h"
#include "libcflat.h"
#include <stdint.h>
-#define USERMODE_STACK_SIZE 0x2000
+#define USERMODE_STACK_ORDER 1 /* 8k */
+#define USERMODE_STACK_SIZE (1 << (12 + USERMODE_STACK_ORDER))
#define RET_TO_KERNEL_IRQ 0x20
static jmp_buf jmpbuf;
@@ -37,9 +39,14 @@ uint64_t run_in_user(usermode_func func, unsigned int fault_vector,
{
extern char ret_to_kernel;
volatile uint64_t rax = 0;
- static unsigned char user_stack[USERMODE_STACK_SIZE];
+ static unsigned char *user_stack;
handler old_ex;
+ if (!user_stack) {
+ user_stack = alloc_pages(USERMODE_STACK_ORDER);
+ printf("%p\n", user_stack);
+ }
+
*raised_vector = 0;
set_idt_entry(RET_TO_KERNEL_IRQ, &ret_to_kernel, 3);
old_ex = handle_exception(fault_vector,
@@ -51,6 +58,8 @@ uint64_t run_in_user(usermode_func func, unsigned int fault_vector,
return 0;
}
+ memcpy(user_stack + USERMODE_STACK_SIZE - 8, &func, 8);
+
asm volatile (
/* Prepare kernel SP for exception handlers */
"mov %%rsp, %[rsp0]\n\t"
@@ -63,12 +72,13 @@ uint64_t run_in_user(usermode_func func, unsigned int fault_vector,
"pushq %[user_stack_top]\n\t"
"pushfq\n\t"
"pushq %[user_cs]\n\t"
- "lea user_mode(%%rip), %%rax\n\t"
+ "lea user_mode+0x800000(%%rip), %%rax\n\t" // smap.flat places usermode addresses at 8MB-16MB
"pushq %%rax\n\t"
"iretq\n"
"user_mode:\n\t"
/* Back up volatile registers before invoking func */
+ "pop %%rax\n\t"
"push %%rcx\n\t"
"push %%rdx\n\t"
"push %%rdi\n\t"
@@ -78,11 +88,12 @@ uint64_t run_in_user(usermode_func func, unsigned int fault_vector,
"push %%r10\n\t"
"push %%r11\n\t"
/* Call user mode function */
+ "add $0x800000,%%rbp\n\t"
"mov %[arg1], %%rdi\n\t"
"mov %[arg2], %%rsi\n\t"
"mov %[arg3], %%rdx\n\t"
"mov %[arg4], %%rcx\n\t"
- "call *%[func]\n\t"
+ "call *%%rax\n\t"
/* Restore registers */
"pop %%r11\n\t"
"pop %%r10\n\t"
@@ -112,12 +123,11 @@ uint64_t run_in_user(usermode_func func, unsigned int fault_vector,
[arg2]"m"(arg2),
[arg3]"m"(arg3),
[arg4]"m"(arg4),
- [func]"m"(func),
[user_ds]"i"(USER_DS),
[user_cs]"i"(USER_CS),
[kernel_ds]"rm"(KERNEL_DS),
[user_stack_top]"r"(user_stack +
- sizeof(user_stack)),
+ USERMODE_STACK_SIZE - 8),
[kernel_entry_vector]"i"(RET_TO_KERNEL_IRQ));
handle_exception(fault_vector, old_ex);
diff --git a/x86/smap.c b/x86/smap.c
index 9a823a55..65119442 100644
--- a/x86/smap.c
+++ b/x86/smap.c
@@ -2,6 +2,7 @@
#include <alloc_page.h>
#include "x86/desc.h"
#include "x86/processor.h"
+#include "x86/usermode.h"
#include "x86/vm.h"
volatile int pf_count = 0;
@@ -89,6 +90,31 @@ static void check_smap_nowp(void)
write_cr3(read_cr3());
}
+#ifdef __x86_64__
+static void iret(void)
+{
+ asm volatile(
+ "mov %%rsp, %%rcx;"
+ "movl %%ss, %%ebx; pushq %%rbx; pushq %%rcx;"
+ "pushf;"
+ "movl %%cs, %%ebx; pushq %%rbx; "
+ "lea 1f(%%rip), %%rbx; pushq %%rbx; iretq; 1:"
+
+ : : : "ebx", "ecx", "cc"); /* RPL=0 */
+}
+
+static void test_user_iret(void)
+{
+ bool raised_vector;
+ uintptr_t user_iret = (uintptr_t)iret + USER_BASE;
+
+ run_in_user((usermode_func)user_iret, PF_VECTOR, 0, 0, 0, 0,
+ &raised_vector);
+
+ report(!raised_vector, "No #PF on CPL=3 DPL=3 iret");
+}
+#endif
+
int main(int ac, char **av)
{
unsigned long i;
@@ -196,7 +222,9 @@ int main(int ac, char **av)
check_smap_nowp();
- // TODO: implicit kernel access from ring 3 (e.g. int)
+#ifdef __x86_64__
+ test_user_iret();
+#endif
return report_summary();
}