[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [VERY RFC PATCH] x86_64: Rework storing segment registers
From: |
Luca |
Subject: |
Re: [VERY RFC PATCH] x86_64: Rework storing segment registers |
Date: |
Thu, 15 Jun 2023 19:02:39 +0200 |
Hi Sergey,
Il 15/06/23 16:54, Sergey Bugaev ha scritto:
* For USER32, don't store fs/gs base at all
* For !USER32, store fs/gs base outside of the PCB stack
* For !USER32, don't store or touch es, ds, fs, gs (but keep ss and cs)
* For !USER32, disable all of the v86 code
---
Incidentally I have a patch very very similar to this that I'm testing
in various configurations (also thanks to your suggestion about the
stack corruption!), although in my case I had to disable some more code
(e.g. in db_trace.c, db_interface.c, probably because I have kdb enabled).
So I went ahead and just made x86_64 !USER32 not store or access those
segment registers, along with moving fs/gs base out of iss and disabling
v86 (not that I know what v86 is, but it sounds like something we don't
need to support considering we only allow running x86_64 code).
I think V86 is not even used on the 32-bit build, apparently it is a
special protected mode to support older x86 programs...
I have only tested my configuration (x86_64 !USER32 !MACH_KDB) -- quite
likely this doesn't build or work in others; but for me it seems to work
very well and I haven't got a single crash, in kernel space or user
space.
Please do review!
debootstrap is still not quite happy. I've uploaded the log here: [0]
[0]: https://paste.gg/p/anonymous/c976008dc38342cd963b0778586ead19
That's a bit strange, with my kernel I can now successfully end the
second stage, and without these warnings. Strangely, I don't see the
"Setting up translators: " line...
On the other hand, I had to fix a tricky combination of syscalls and
interrupts, that for some reason was triggered quite often and caused a
triple fault, together with an issue with nested interrupts (which I'm
not sure are still used, but the code to handle it looks weird).
How are you launching the second stage? For now I used Samuel's archive
as ramdisk, then launched bash and then debootstrap --second-stage:
https://paste.debian.net/1283107/
i386/i386/debug_i386.c | 2 -
i386/i386/i386asm.sym | 4 +-
i386/i386/pcb.c | 30 ++++++++----
i386/i386/thread.h | 27 +++++++++--
x86_64/locore.S | 105 +++++++++++++++++------------------------
5 files changed, 90 insertions(+), 78 deletions(-)
diff --git a/i386/i386/debug_i386.c b/i386/i386/debug_i386.c
index b5465796..41d032e3 100644
--- a/i386/i386/debug_i386.c
+++ b/i386/i386/debug_i386.c
@@ -40,8 +40,6 @@ void dump_ss(const struct i386_saved_state *st)
st->r8, st->r9, st->r10, st->r11);
printf("R12 %016lx R13 %016lx R14 %016lx R15 %016lx\n",
st->r12, st->r13, st->r14, st->r15);
- printf("FSBASE %016lx GSBASE %016lx\n",
- st->fsbase, st->gsbase);
printf("RIP %016lx EFLAGS %08lx\n", st->eip, st->efl);
#else
printf("EAX %08lx EBX %08lx ECX %08lx EDX %08lx\n",
diff --git a/i386/i386/i386asm.sym b/i386/i386/i386asm.sym
index fd0be557..8af0c5d6 100644
--- a/i386/i386/i386asm.sym
+++ b/i386/i386/i386asm.sym
@@ -84,8 +84,10 @@ size i386_kernel_state iks
size i386_exception_link iel
+#if !defined(__x86_64__) || defined(USER32)
I'm wondering if we could simplify this case by defining USER32 also on
i386.
offset i386_saved_state r gs
offset i386_saved_state r fs
+#endif
offset i386_saved_state r cs
offset i386_saved_state r uesp
offset i386_saved_state r eax
@@ -108,8 +110,6 @@ offset i386_saved_state r r12
offset i386_saved_state r r13
offset i386_saved_state r r14
offset i386_saved_state r r15
-offset i386_saved_state r fsbase
-offset i386_saved_state r gsbase
#endif
offset i386_interrupt_state i eip
diff --git a/i386/i386/pcb.c b/i386/i386/pcb.c
index fb535709..d8987ddf 100644
--- a/i386/i386/pcb.c
+++ b/i386/i386/pcb.c
@@ -145,9 +145,14 @@ void switch_ktss(pcb_t pcb)
* won`t save the v86 segments, so we leave room.
*/
+#if !defined(__x86_64__) || defined(USER32)
pcb_stack_top = (pcb->iss.efl & EFL_VM)
? (long) (&pcb->iss + 1)
: (long) (&pcb->iss.v86_segs);
+#else
+ pcb_stack_top = (vm_offset_t) (&pcb->iss + 1);
+#endif
+
#ifdef __x86_64__
assert((pcb_stack_top & 0xF) == 0);
#endif
@@ -224,8 +229,8 @@ void switch_ktss(pcb_t pcb)
#endif /* MACH_PV_DESCRIPTORS */
#if defined(__x86_64__) && !defined(USER32)
- wrmsr(MSR_REG_FSBASE, pcb->iss.fsbase);
- wrmsr(MSR_REG_GSBASE, pcb->iss.gsbase);
+ wrmsr(MSR_REG_FSBASE, pcb->isbs.fsbase);
+ wrmsr(MSR_REG_GSBASE, pcb->isbs.gsbase);
#endif
db_load_context(pcb);
@@ -412,10 +417,12 @@ void pcb_init(task_t parent_task, thread_t thread)
*/
pcb->iss.cs = USER_CS;
pcb->iss.ss = USER_DS;
+#if !defined(__x86_64__) || defined(USER32)
pcb->iss.ds = USER_DS;
pcb->iss.es = USER_DS;
pcb->iss.fs = USER_DS;
pcb->iss.gs = USER_DS;
+#endif
pcb->iss.efl = EFL_USER_SET;
thread->pcb = pcb;
@@ -477,6 +484,7 @@ kern_return_t thread_setstatus(
state = (struct i386_thread_state *) tstate;
+#if !defined(__x86_64__) || defined(USER32)
if (flavor == i386_REGS_SEGS_STATE) {
/*
* Code and stack selectors must not be null,
@@ -494,6 +502,7 @@ kern_return_t thread_setstatus(
|| state->ss == 0 || (state->ss & SEL_PL) != SEL_PL_U)
return KERN_INVALID_ARGUMENT;
}
+#endif
saved_state = USER_REGS(thread);
@@ -532,7 +541,6 @@ kern_return_t thread_setstatus(
saved_state->eip = state->eip;
saved_state->efl = (state->efl & ~EFL_USER_CLEAR)
| EFL_USER_SET;
-#endif /* __x86_64__ && !USER32 */
/*
* Segment registers. Set differently in V8086 mode.
@@ -590,6 +598,7 @@ kern_return_t thread_setstatus(
saved_state->fs = state->fs;
saved_state->gs = state->gs;
}
+#endif /* __x86_64__ && !USER32 */
break;
}
@@ -631,6 +640,7 @@ kern_return_t thread_setstatus(
break;
}
+#if !defined(__x86_64__) || defined(USER32)
case i386_V86_ASSIST_STATE:
{
struct i386_v86_assist_state *state;
@@ -657,7 +667,7 @@ kern_return_t thread_setstatus(
USER_REGS(thread)->efl & (EFL_TF | EFL_IF);
break;
}
-
+#endif
case i386_DEBUG_STATE:
{
struct i386_debug_state *state;
@@ -680,8 +690,8 @@ kern_return_t thread_setstatus(
return KERN_INVALID_ARGUMENT;
state = (struct i386_fsgs_base_state *) tstate;
- thread->pcb->iss.fsbase = state->fs_base;
- thread->pcb->iss.gsbase = state->gs_base;
+ thread->pcb->isbs.fsbase = state->fs_base;
+ thread->pcb->isbs.gsbase = state->gs_base;
if (thread == current_thread()) {
wrmsr(MSR_REG_FSBASE, state->fs_base);
wrmsr(MSR_REG_GSBASE, state->gs_base);
@@ -766,7 +776,6 @@ kern_return_t thread_getstatus(
state->uesp = saved_state->uesp;
state->efl = saved_state->efl;
state->esp = 0; /* unused */
-#endif /* __x86_64__ && !USER32 */
state->cs = saved_state->cs;
state->ss = saved_state->ss;
@@ -798,6 +807,7 @@ kern_return_t thread_getstatus(
state->fs = saved_state->fs & 0xffff;
state->gs = saved_state->gs & 0xffff;
}
+#endif /* __x86_64__ && !USER32 */
*count = i386_THREAD_STATE_COUNT;
break;
}
@@ -836,6 +846,7 @@ kern_return_t thread_getstatus(
break;
}
+#if !defined(__x86_64__) || defined(USER32)
case i386_V86_ASSIST_STATE:
{
struct i386_v86_assist_state *state;
@@ -850,6 +861,7 @@ kern_return_t thread_getstatus(
*count = i386_V86_ASSIST_STATE_COUNT;
break;
}
+#endif
case i386_DEBUG_STATE:
{
@@ -872,8 +884,8 @@ kern_return_t thread_getstatus(
return KERN_INVALID_ARGUMENT;
state = (struct i386_fsgs_base_state *) tstate;
- state->fs_base = thread->pcb->iss.fsbase;
- state->gs_base = thread->pcb->iss.gsbase;
+ state->fs_base = thread->pcb->isbs.fsbase;
+ state->gs_base = thread->pcb->isbs.gsbase;
*count = i386_FSGS_BASE_STATE_COUNT;
break;
}
diff --git a/i386/i386/thread.h b/i386/i386/thread.h
index b5fc5ffb..eab762dc 100644
--- a/i386/i386/thread.h
+++ b/i386/i386/thread.h
@@ -51,14 +51,13 @@
*/
struct i386_saved_state {
-#ifdef __x86_64__
- unsigned long fsbase;
- unsigned long gsbase;
-#endif
+#if !defined(__x86_64__) || defined(USER32)
unsigned long gs;
unsigned long fs;
unsigned long es;
unsigned long ds;
+#endif
+
#ifdef __x86_64__
unsigned long r15;
unsigned long r14;
@@ -85,12 +84,15 @@ struct i386_saved_state {
unsigned long efl;
unsigned long uesp;
unsigned long ss;
+
+#if !defined(__x86_64__) || defined(USER32)
struct v86_segs {
unsigned long v86_es; /* virtual 8086 segment registers */
unsigned long v86_ds;
unsigned long v86_fs;
unsigned long v86_gs;
} v86_segs;
+#endif
};
/*
@@ -144,6 +146,7 @@ struct i386_fpsave_state {
};
};
+#if !defined(__x86_64__) || defined(USER32)
/*
* v86_assist_state:
*
@@ -157,6 +160,7 @@ struct v86_assist_state {
unsigned short flags; /* 8086 flag bits */
};
#define V86_IF_PENDING 0x8000 /* unused bit */
+#endif
/*
* i386_interrupt_state:
@@ -167,10 +171,13 @@ struct v86_assist_state {
*/
struct i386_interrupt_state {
+#if !defined(__x86_64__) || defined(USER32)
long gs;
long fs;
long es;
long ds;
+#endif
+
#ifdef __x86_64__
long r11;
long r10;
@@ -187,6 +194,13 @@ struct i386_interrupt_state {
long efl;
};
+#if defined(__x86_64__) && !defined(USER32)
+struct i386_saved_fsgs_base_state {
+ unsigned long fsbase;
+ unsigned long gsbase;
+};
+#endif
+
These can maybe be placed in the i386_machine_state below
/*
* i386_machine_state:
*
@@ -197,7 +211,9 @@ struct i386_interrupt_state {
struct i386_machine_state {
struct user_ldt * ldt;
struct i386_fpsave_state *ifps;
+#if !defined(__x86_64__) || defined(USER32)
struct v86_assist_state v86s;
+#endif
struct real_descriptor user_gdt[USER_GDT_SLOTS];
struct i386_debug_state ids;
};
@@ -209,6 +225,9 @@ typedef struct pcb {
#endif
struct i386_saved_state iss;
struct i386_machine_state ims;
+#if defined(__x86_64__) && !defined(USER32)
+ struct i386_saved_fsgs_base_state isbs;
+#endif
decl_simple_lock_data(, lock)
unsigned short init_control; /* Initial FPU control to set */
#ifdef LINUX_DEV
diff --git a/x86_64/locore.S b/x86_64/locore.S
index 4d61d618..fba2ad03 100644
--- a/x86_64/locore.S
+++ b/x86_64/locore.S
@@ -42,45 +42,6 @@
#define pusha pushq %rax ; pushq %rcx ; pushq %rdx ; pushq %rbx ; subq
$8,%rsp ; pushq %rbp ; pushq %rsi ; pushq %rdi ; pushq %r8 ; pushq %r9 ; pushq
%r10 ; pushq %r11 ; pushq %r12 ; pushq %r13 ; pushq %r14 ; pushq %r15
#define popa popq %r15 ; popq %r14 ; popq %r13 ; popq %r12 ; popq %r11 ; popq
%r10 ; popq %r9 ; popq %r8 ; popq %rdi ; popq %rsi ; popq %rbp ; addq $8,%rsp ;
popq %rbx ; popq %rdx ; popq %rcx ; popq %rax
-#ifdef USER32
-#define PUSH_FSGS \
- pushq %fs ;\
- pushq %gs ;\
- subq $16,%rsp
-#else
-#define PUSH_FSGS \
- subq $32,%rsp
-#endif
-
-#ifdef USER32
-#define POP_FSGS \
- popq %gs ;\
- popq %fs ;\
- addq $16,%rsp
-#else
-#define POP_FSGS \
- addq $32,%rsp
-#endif
-
-#ifdef USER32
-#define PUSH_FSGS_ISR \
- pushq %fs ;\
- pushq %gs
-#else
-#define PUSH_FSGS_ISR \
- subq $16,%rsp
-#endif
-
-#ifdef USER32
-#define POP_FSGS_ISR \
- popq %gs ;\
- popq %fs
-#else
-#define POP_FSGS_ISR \
- addq $16,%rsp
-#endif
-
Why did you remove these? The idea was that they could now be empty on
full 64-bit, and encapsulating the segment handling code for 32-bit,
which is repeated in various places... One addition could also be to set
the kernel segments after saving the user ones, or are there some more
variations of them?
/*
* Fault recovery.
@@ -368,14 +329,17 @@ ENTRY(t_segnp)
/* indicate fault type */
trap_check_kernel_exit:
+#ifdef USER32
testq $(EFL_VM),32(%rsp) /* is trap from V86 mode? */
jnz EXT(alltraps) /* isn`t kernel trap if so */
+#endif
/* Note: handling KERNEL_RING value by hand */
testq $2,24(%rsp) /* is trap from kernel mode? */
jnz EXT(alltraps) /* if so: */
/* check for the kernel exit sequence */
cmpq $_kret_iret,16(%rsp) /* on IRET? */
je fault_iret
+#ifdef USER32
#if 0
cmpq $_kret_popl_ds,16(%rsp) /* popping DS? */
je fault_popl_ds
@@ -386,6 +350,7 @@ trap_check_kernel_exit:
je fault_popl_fs
cmpq $_kret_popl_gs,16(%rsp) /* popping GS? */
je fault_popl_gs
+#endif
take_fault: /* if none of the above: */
jmp EXT(alltraps) /* treat as normal trap. */
@@ -414,6 +379,7 @@ fault_iret:
popq %rax /* restore eax */
jmp EXT(alltraps) /* take fault */
+#ifdef USER32
/*
* Fault restoring a segment register. The user's registers are still
* saved on the stack. The offending segment register has not been
@@ -446,13 +412,11 @@ push_fs:
pushq %fs /* restore fs, */
push_gs:
pushq %gs /* restore gs. */
-push_gsbase:
- pushq $0
- pushq $0
push_segregs:
movq %rax,R_TRAPNO(%rsp) /* set trap number */
movq %rdx,R_ERR(%rsp) /* set error code */
jmp trap_set_segs /* take trap */
+#endif
/*
* Debug trap. Check for single-stepping across system call into
@@ -462,8 +426,10 @@ push_segregs:
*/
ENTRY(t_debug)
INT_FIX
+#ifdef USER32
testq $(EFL_VM),16(%rsp) /* is trap from V86 mode? */
jnz 0f /* isn`t kernel trap if so */
+#endif
/* Note: handling KERNEL_RING value by hand */
testq $2,8(%rsp) /* is trap from kernel mode? */
jnz 0f /* if so: */
@@ -510,11 +476,13 @@ ENTRY(t_page_fault)
ENTRY(alltraps)
pusha /* save the general registers */
trap_push_segs:
+#ifdef USER32
movq %ds,%rax /* and the segment registers */
pushq %rax
movq %es,%rax /* and the segment registers */
pushq %rax
- PUSH_FSGS
+ pushq %fs
+ pushq %gs
/* Note that we have to load the segment registers
even if this is a trap from the kernel,
@@ -523,14 +491,15 @@ trap_push_segs:
mov %ss,%ax /* switch to kernel data segment */
mov %ax,%ds /* (same as kernel stack segment) */
mov %ax,%es
-#ifdef USER32
mov %ax,%fs
mov %ax,%gs
#endif
trap_set_segs:
cld /* clear direction flag */
+#ifdef USER32
testl $(EFL_VM),R_EFLAGS(%rsp) /* in V86 mode? */
jnz trap_from_user /* user mode trap if so */
+#endif
/* Note: handling KERNEL_RING value by hand */
testb $2,R_CS(%rsp) /* user mode trap? */
jz trap_from_kernel /* kernel trap if not */
@@ -580,23 +549,18 @@ _return_to_user:
*/
_return_from_kernel:
- addq $16,%rsp /* skip FS/GS base */
-#ifndef USER32
-_kret_popl_gs:
-_kret_popl_fs:
- addq $16,%rsp /* skip FS/GS selector */
-#else
+#ifdef USER32
_kret_popl_gs:
popq %gs /* restore segment registers */
_kret_popl_fs:
popq %fs
-#endif
_kret_popl_es:
popq %rax
movq %rax,%es
_kret_popl_ds:
popq %rax
movq %rax,%ds
+#endif
popa /* restore general registers */
addq $16,%rsp /* discard trap number and error code */
_kret_iret:
@@ -742,16 +706,17 @@ ENTRY(all_intrs)
cmpq %ss:EXT(int_stack_base),%rdx
je int_from_intstack /* if not: */
+#ifdef USER32
movq %ds,%rdx /* save segment registers */
pushq %rdx
movq %es,%rdx
pushq %rdx
- PUSH_FSGS_ISR
+ pushq %fs
+ pushq %gs
> mov %ss,%dx /* switch to kernel segments */
mov %dx,%ds
mov %dx,%es
-#ifdef USER32
mov %dx,%fs
mov %dx,%gs
#endif
@@ -784,8 +749,10 @@ LEXT(return_to_iret) /* ( label for
kdb_kintr and hardclock) */
popq %rsp /* switch back to old stack */
+#ifdef USER32
testl $(EFL_VM),I_EFL(%rsp) /* if in V86 */
jnz 0f /* or */
+#endif
/* Note: handling KERNEL_RING value by hand */
testb $2,I_CS(%rsp) /* user mode, */
jz 1f /* check for ASTs */
@@ -793,11 +760,14 @@ LEXT(return_to_iret) /* ( label for
kdb_kintr and hardclock) */
cmpq $0,CX(EXT(need_ast),%edx)
jnz ast_from_interrupt /* take it if so */
1:
- POP_FSGS_ISR
+#ifdef USER32
+ popq %gs
+ popq %fs
pop %rdx
mov %rdx,%es
pop %rdx
mov %rdx,%ds
+#endif
pop %r11
pop %r10
pop %r9
@@ -847,11 +817,14 @@ stack_overflowed:
* ss
*/
ast_from_interrupt:
- POP_FSGS_ISR
+#ifdef USER32
+ popq %gs
+ popq %fs
pop %rdx
mov %rdx,%es
pop %rdx
mov %rdx,%ds
+#endif
popq %r11
popq %r10
popq %r9
@@ -864,16 +837,18 @@ ast_from_interrupt:
pushq $0 /* zero code */
pushq $0 /* zero trap number */
pusha /* save general registers */
+
+#ifdef USER32
mov %ds,%rdx /* save segment registers */
push %rdx
mov %es,%rdx
push %rdx
- PUSH_FSGS_ISR
+ pushq %fs
+ pushq %gs
mov %ss,%dx /* switch to kernel segments */
mov %dx,%ds
mov %dx,%es
-#ifdef USER32
mov %dx,%fs
mov %dx,%gs
#endif
@@ -994,20 +969,26 @@ kdb_from_iret_i: /* on interrupt stack */
pushq $0 /* zero error code */
pushq $0 /* zero trap number */
pusha /* save general registers */
+#ifdef USER32
mov %ds,%rdx /* save segment registers */
push %rdx
mov %es,%rdx
push %rdx
- PUSH_FSGS
+ pushq %fs
+ pushq %gs
+#endif
movq %rsp,%rdx /* pass regs, */
movq $0,%rsi /* code, */
movq $-1,%rdi /* type to kdb */
call EXT(kdb_trap)
- POP_FSGS
+#ifdef USER32
+ popq %gs
+ popq %fs
pop %rdx
mov %rdx,%es
pop %rdx
mov %rdx,%ds
+#endif
popa /* restore general registers */
addq $16,%rsp
@@ -1082,23 +1063,27 @@ ttd_from_iret_i: /* on interrupt stack */
pushq $0 /* zero error code */
pushq $0 /* zero trap number */
pusha /* save general registers */
+#ifdef USER32
mov %ds,%rdx /* save segment registers */
push %rdx
mov %es,%rdx
push %rdx
push %fs
push %gs
+#endif
ud2 // TEST it
movq %rsp,%rdx /* pass regs, */
movq $0,%rsi /* code, */
movq $-1,%rdi /* type to kdb */
call _kttd_trap
+#ifdef USER32
pop %gs /* restore segment registers */
pop %fs
pop %rdx
mov %rdx,%es
pop %rdx
mov %rdx,%ds
+#endif
popa /* restore general registers */
addq $16,%rsp
@@ -1137,8 +1122,6 @@ syscall_entry_2:
pushq %rdx
pushq %fs
pushq %gs
- pushq $0 // gsbase
- pushq $0 // fsbase
mov %ss,%dx /* switch to kernel data segment */
mov %dx,%ds