bug-hurd
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [VERY RFC PATCH] x86_64: Rework storing segment registers


From: Luca
Subject: Re: [VERY RFC PATCH] x86_64: Rework storing segment registers
Date: Thu, 15 Jun 2023 19:02:39 +0200

Hi Sergey,

Il 15/06/23 16:54, Sergey Bugaev ha scritto:
* For USER32, don't store fs/gs base at all
* For !USER32, store fs/gs base outside of the PCB stack
* For !USER32, don't store or touch es, ds, fs, gs (but keep ss and cs)
* For !USER32, disable all of the v86 code
---

Incidentally I have a patch very very similar to this that I'm testing in various configurations (also thanks to your suggestion about the stack corruption!), although in my case I had to disable some more code (e.g. in db_trace.c, db_interface.c, probably because I have kdb enabled).

So I went ahead and just made x86_64 !USER32 not store or access those
segment registers, along with moving fs/gs base out of iss and disabling
v86 (not that I know what v86 is, but it sounds like something we don't
need to support considering we only allow running x86_64 code).

I think V86 is not even used on the 32-bit build, apparently it is a special protected mode to support older x86 programs...

I have only tested my configuration (x86_64 !USER32 !MACH_KDB) -- quite
likely this doesn't build or work in others; but for me it seems to work
very well and I haven't got a single crash, in kernel space or user
space.

Please do review!

debootstrap is still not quite happy. I've uploaded the log here: [0]

[0]: https://paste.gg/p/anonymous/c976008dc38342cd963b0778586ead19

That's a bit strange, with my kernel I can now successfully end the second stage, and without these warnings. Strangely, I don't see the "Setting up translators: " line...

On the other hand, I had to fix a tricky combination of syscalls and interrupts, that for some reason was triggered quite often and caused a triple fault, together with an issue with nested interrupts (which I'm not sure are still used, but the code to handle it looks weird).

How are you launching the second stage? For now I used Samuel's archive as ramdisk, then launched bash and then debootstrap --second-stage:

https://paste.debian.net/1283107/


  i386/i386/debug_i386.c |   2 -
  i386/i386/i386asm.sym  |   4 +-
  i386/i386/pcb.c        |  30 ++++++++----
  i386/i386/thread.h     |  27 +++++++++--
  x86_64/locore.S        | 105 +++++++++++++++++------------------------
  5 files changed, 90 insertions(+), 78 deletions(-)

diff --git a/i386/i386/debug_i386.c b/i386/i386/debug_i386.c
index b5465796..41d032e3 100644
--- a/i386/i386/debug_i386.c
+++ b/i386/i386/debug_i386.c
@@ -40,8 +40,6 @@ void dump_ss(const struct i386_saved_state *st)
                st->r8, st->r9, st->r10, st->r11);
        printf("R12 %016lx R13 %016lx R14 %016lx R15 %016lx\n",
                st->r12, st->r13, st->r14, st->r15);
-       printf("FSBASE %016lx GSBASE %016lx\n",
-               st->fsbase, st->gsbase);
        printf("RIP %016lx EFLAGS %08lx\n", st->eip, st->efl);
  #else
        printf("EAX %08lx EBX %08lx ECX %08lx EDX %08lx\n",
diff --git a/i386/i386/i386asm.sym b/i386/i386/i386asm.sym
index fd0be557..8af0c5d6 100644
--- a/i386/i386/i386asm.sym
+++ b/i386/i386/i386asm.sym
@@ -84,8 +84,10 @@ size i386_kernel_state       iks
size i386_exception_link iel +#if !defined(__x86_64__) || defined(USER32)

I'm wondering if we could simplify this case by defining USER32 also on i386.

  offset        i386_saved_state        r       gs
  offset        i386_saved_state        r       fs
+#endif
  offset        i386_saved_state        r       cs
  offset        i386_saved_state        r       uesp
  offset        i386_saved_state        r       eax
@@ -108,8 +110,6 @@ offset      i386_saved_state        r       r12
  offset        i386_saved_state        r       r13
  offset        i386_saved_state        r       r14
  offset        i386_saved_state        r       r15
-offset i386_saved_state        r       fsbase
-offset i386_saved_state        r       gsbase
  #endif
offset i386_interrupt_state i eip
diff --git a/i386/i386/pcb.c b/i386/i386/pcb.c
index fb535709..d8987ddf 100644
--- a/i386/i386/pcb.c
+++ b/i386/i386/pcb.c
@@ -145,9 +145,14 @@ void switch_ktss(pcb_t pcb)
         *      won`t save the v86 segments, so we leave room.
         */
+#if !defined(__x86_64__) || defined(USER32)
        pcb_stack_top = (pcb->iss.efl & EFL_VM)
                        ? (long) (&pcb->iss + 1)
                        : (long) (&pcb->iss.v86_segs);
+#else
+       pcb_stack_top = (vm_offset_t) (&pcb->iss + 1);
+#endif
+
  #ifdef __x86_64__
        assert((pcb_stack_top & 0xF) == 0);
  #endif
@@ -224,8 +229,8 @@ void switch_ktss(pcb_t pcb)
  #endif /* MACH_PV_DESCRIPTORS */
#if defined(__x86_64__) && !defined(USER32)
-       wrmsr(MSR_REG_FSBASE, pcb->iss.fsbase);
-       wrmsr(MSR_REG_GSBASE, pcb->iss.gsbase);
+       wrmsr(MSR_REG_FSBASE, pcb->isbs.fsbase);
+       wrmsr(MSR_REG_GSBASE, pcb->isbs.gsbase);
  #endif
db_load_context(pcb);
@@ -412,10 +417,12 @@ void pcb_init(task_t parent_task, thread_t thread)
         */
        pcb->iss.cs = USER_CS;
        pcb->iss.ss = USER_DS;
+#if !defined(__x86_64__) || defined(USER32)
        pcb->iss.ds = USER_DS;
        pcb->iss.es = USER_DS;
        pcb->iss.fs = USER_DS;
        pcb->iss.gs = USER_DS;
+#endif
        pcb->iss.efl = EFL_USER_SET;
thread->pcb = pcb;
@@ -477,6 +484,7 @@ kern_return_t thread_setstatus(
state = (struct i386_thread_state *) tstate; +#if !defined(__x86_64__) || defined(USER32)
                if (flavor == i386_REGS_SEGS_STATE) {
                    /*
                     * Code and stack selectors must not be null,
@@ -494,6 +502,7 @@ kern_return_t thread_setstatus(
                     || state->ss == 0 || (state->ss & SEL_PL) != SEL_PL_U)
                        return KERN_INVALID_ARGUMENT;
                }
+#endif
saved_state = USER_REGS(thread); @@ -532,7 +541,6 @@ kern_return_t thread_setstatus(
                saved_state->eip = state->eip;
                saved_state->efl = (state->efl & ~EFL_USER_CLEAR)
                                    | EFL_USER_SET;
-#endif /* __x86_64__ && !USER32 */
/*
                 * Segment registers.  Set differently in V8086 mode.
@@ -590,6 +598,7 @@ kern_return_t thread_setstatus(
                    saved_state->fs = state->fs;
                    saved_state->gs = state->gs;
                }
+#endif /* __x86_64__ && !USER32 */
                break;
            }
@@ -631,6 +640,7 @@ kern_return_t thread_setstatus(
                break;
            }
+#if !defined(__x86_64__) || defined(USER32)
            case i386_V86_ASSIST_STATE:
            {
                struct i386_v86_assist_state *state;
@@ -657,7 +667,7 @@ kern_return_t thread_setstatus(
                        USER_REGS(thread)->efl & (EFL_TF | EFL_IF);
                break;
            }
-
+#endif
            case i386_DEBUG_STATE:
            {
                struct i386_debug_state *state;
@@ -680,8 +690,8 @@ kern_return_t thread_setstatus(
                              return KERN_INVALID_ARGUMENT;
state = (struct i386_fsgs_base_state *) tstate;
-                    thread->pcb->iss.fsbase = state->fs_base;
-                    thread->pcb->iss.gsbase = state->gs_base;
+                    thread->pcb->isbs.fsbase = state->fs_base;
+                    thread->pcb->isbs.gsbase = state->gs_base;
                      if (thread == current_thread()) {
                              wrmsr(MSR_REG_FSBASE, state->fs_base);
                              wrmsr(MSR_REG_GSBASE, state->gs_base);
@@ -766,7 +776,6 @@ kern_return_t thread_getstatus(
                state->uesp = saved_state->uesp;
                state->efl = saved_state->efl;
                state->esp = 0;      /* unused */
-#endif /* __x86_64__ && !USER32 */
state->cs = saved_state->cs;
                state->ss = saved_state->ss;
@@ -798,6 +807,7 @@ kern_return_t thread_getstatus(
                    state->fs = saved_state->fs & 0xffff;
                    state->gs = saved_state->gs & 0xffff;
                }
+#endif /* __x86_64__ && !USER32 */
                *count = i386_THREAD_STATE_COUNT;
                break;
            }
@@ -836,6 +846,7 @@ kern_return_t thread_getstatus(
                break;
            }
+#if !defined(__x86_64__) || defined(USER32)
            case i386_V86_ASSIST_STATE:
            {
                struct i386_v86_assist_state *state;
@@ -850,6 +861,7 @@ kern_return_t thread_getstatus(
                *count = i386_V86_ASSIST_STATE_COUNT;
                break;
            }
+#endif
case i386_DEBUG_STATE:
            {
@@ -872,8 +884,8 @@ kern_return_t thread_getstatus(
                              return KERN_INVALID_ARGUMENT;
state = (struct i386_fsgs_base_state *) tstate;
-                    state->fs_base = thread->pcb->iss.fsbase;
-                    state->gs_base = thread->pcb->iss.gsbase;
+                    state->fs_base = thread->pcb->isbs.fsbase;
+                    state->gs_base = thread->pcb->isbs.gsbase;
                      *count = i386_FSGS_BASE_STATE_COUNT;
                      break;
              }
diff --git a/i386/i386/thread.h b/i386/i386/thread.h
index b5fc5ffb..eab762dc 100644
--- a/i386/i386/thread.h
+++ b/i386/i386/thread.h
@@ -51,14 +51,13 @@
   */
struct i386_saved_state {
-#ifdef __x86_64__
-       unsigned long   fsbase;
-       unsigned long   gsbase;
-#endif
+#if !defined(__x86_64__) || defined(USER32)
        unsigned long   gs;
        unsigned long   fs;
        unsigned long   es;
        unsigned long   ds;
+#endif
+
  #ifdef __x86_64__
        unsigned long   r15;
        unsigned long   r14;
@@ -85,12 +84,15 @@ struct i386_saved_state {
        unsigned long   efl;
        unsigned long   uesp;
        unsigned long   ss;
+
+#if !defined(__x86_64__) || defined(USER32)
        struct v86_segs {
            unsigned long v86_es;       /* virtual 8086 segment registers */
            unsigned long v86_ds;
            unsigned long v86_fs;
            unsigned long v86_gs;
        } v86_segs;
+#endif
  };
/*
@@ -144,6 +146,7 @@ struct i386_fpsave_state {
        };
  };
+#if !defined(__x86_64__) || defined(USER32)
  /*
   *    v86_assist_state:
   *
@@ -157,6 +160,7 @@ struct v86_assist_state {
        unsigned short          flags;  /* 8086 flag bits */
  };
  #define       V86_IF_PENDING          0x8000  /* unused bit */
+#endif
/*
   *    i386_interrupt_state:
@@ -167,10 +171,13 @@ struct v86_assist_state {
   */
struct i386_interrupt_state {
+#if !defined(__x86_64__) || defined(USER32)
        long    gs;
        long    fs;
        long    es;
        long    ds;
+#endif
+
  #ifdef __x86_64__
        long    r11;
        long    r10;
@@ -187,6 +194,13 @@ struct i386_interrupt_state {
        long    efl;
  };
+#if defined(__x86_64__) && !defined(USER32)
+struct i386_saved_fsgs_base_state {
+       unsigned long   fsbase;
+       unsigned long   gsbase;
+};
+#endif
+

These can maybe be placed in the i386_machine_state below

  /*
   *    i386_machine_state:
   *
@@ -197,7 +211,9 @@ struct i386_interrupt_state {
  struct i386_machine_state {
        struct user_ldt *       ldt;
        struct i386_fpsave_state *ifps;
+#if !defined(__x86_64__) || defined(USER32)
        struct v86_assist_state v86s;
+#endif
        struct real_descriptor user_gdt[USER_GDT_SLOTS];
        struct i386_debug_state ids;
  };
@@ -209,6 +225,9 @@ typedef struct pcb {
  #endif
        struct i386_saved_state iss;
        struct i386_machine_state ims;
+#if defined(__x86_64__) && !defined(USER32)
+       struct i386_saved_fsgs_base_state isbs;
+#endif
        decl_simple_lock_data(, lock)
        unsigned short init_control;            /* Initial FPU control to set */
  #ifdef LINUX_DEV
diff --git a/x86_64/locore.S b/x86_64/locore.S
index 4d61d618..fba2ad03 100644
--- a/x86_64/locore.S
+++ b/x86_64/locore.S
@@ -42,45 +42,6 @@
  #define pusha pushq %rax ; pushq %rcx ; pushq %rdx ; pushq %rbx ; subq 
$8,%rsp ; pushq %rbp ; pushq %rsi ; pushq %rdi ; pushq %r8 ; pushq %r9 ; pushq 
%r10 ; pushq %r11 ; pushq %r12 ; pushq %r13 ; pushq %r14 ; pushq %r15
  #define popa popq %r15 ; popq %r14 ; popq %r13 ; popq %r12 ; popq %r11 ; popq 
%r10 ; popq %r9 ; popq %r8 ; popq %rdi ; popq %rsi ; popq %rbp ; addq $8,%rsp ; 
popq %rbx ; popq %rdx ; popq %rcx ; popq %rax
-#ifdef USER32
-#define PUSH_FSGS              \
-       pushq   %fs             ;\
-       pushq   %gs             ;\
-       subq    $16,%rsp
-#else
-#define PUSH_FSGS              \
-       subq    $32,%rsp
-#endif
-
-#ifdef USER32
-#define POP_FSGS               \
-       popq    %gs             ;\
-       popq    %fs             ;\
-       addq    $16,%rsp
-#else
-#define POP_FSGS               \
-       addq    $32,%rsp
-#endif
-
-#ifdef USER32
-#define PUSH_FSGS_ISR          \
-       pushq   %fs             ;\
-       pushq   %gs
-#else
-#define PUSH_FSGS_ISR          \
-       subq    $16,%rsp
-#endif
-
-#ifdef USER32
-#define POP_FSGS_ISR           \
-       popq    %gs             ;\
-       popq    %fs
-#else
-#define POP_FSGS_ISR           \
-       addq    $16,%rsp
-#endif
-

Why did you remove these? The idea was that they could now be empty on full 64-bit, and encapsulating the segment handling code for 32-bit, which is repeated in various places... One addition could also be to set the kernel segments after saving the user ones, or are there some more variations of them?

/*
   * Fault recovery.
@@ -368,14 +329,17 @@ ENTRY(t_segnp)
                                        /* indicate fault type */
trap_check_kernel_exit:
+#ifdef USER32
        testq   $(EFL_VM),32(%rsp)      /* is trap from V86 mode? */
        jnz     EXT(alltraps)           /* isn`t kernel trap if so */
+#endif
        /* Note: handling KERNEL_RING value by hand */
        testq   $2,24(%rsp)             /* is trap from kernel mode? */
        jnz     EXT(alltraps)           /* if so:  */
                                        /* check for the kernel exit sequence */
        cmpq    $_kret_iret,16(%rsp)    /* on IRET? */
        je      fault_iret
+#ifdef USER32
  #if 0
        cmpq    $_kret_popl_ds,16(%rsp) /* popping DS? */
        je      fault_popl_ds
@@ -386,6 +350,7 @@ trap_check_kernel_exit:
        je      fault_popl_fs
        cmpq    $_kret_popl_gs,16(%rsp) /* popping GS? */
        je      fault_popl_gs
+#endif
  take_fault:                           /* if none of the above: */
        jmp     EXT(alltraps)           /* treat as normal trap. */
@@ -414,6 +379,7 @@ fault_iret:
        popq    %rax                    /* restore eax */
        jmp     EXT(alltraps)           /* take fault */
+#ifdef USER32
  /*
   * Fault restoring a segment register.  The user's registers are still
   * saved on the stack.  The offending segment register has not been
@@ -446,13 +412,11 @@ push_fs:
        pushq   %fs                     /* restore fs, */
  push_gs:
        pushq   %gs                     /* restore gs. */
-push_gsbase:
-       pushq   $0
-       pushq   $0
  push_segregs:
        movq    %rax,R_TRAPNO(%rsp)     /* set trap number */
        movq    %rdx,R_ERR(%rsp)        /* set error code */
        jmp     trap_set_segs           /* take trap */
+#endif
/*
   * Debug trap.  Check for single-stepping across system call into
@@ -462,8 +426,10 @@ push_segregs:
   */
  ENTRY(t_debug)
        INT_FIX
+#ifdef USER32
        testq   $(EFL_VM),16(%rsp)      /* is trap from V86 mode? */
        jnz     0f                      /* isn`t kernel trap if so */
+#endif
        /* Note: handling KERNEL_RING value by hand */
        testq   $2,8(%rsp)              /* is trap from kernel mode? */
        jnz     0f                      /* if so: */
@@ -510,11 +476,13 @@ ENTRY(t_page_fault)
  ENTRY(alltraps)
        pusha                           /* save the general registers */
  trap_push_segs:
+#ifdef USER32
        movq    %ds,%rax                        /* and the segment registers */
        pushq   %rax
        movq    %es,%rax                        /* and the segment registers */
        pushq   %rax
-       PUSH_FSGS
+       pushq   %fs
+       pushq   %gs
/* Note that we have to load the segment registers
           even if this is a trap from the kernel,
@@ -523,14 +491,15 @@ trap_push_segs:
        mov     %ss,%ax                 /* switch to kernel data segment */
        mov     %ax,%ds                 /* (same as kernel stack segment) */
        mov     %ax,%es
-#ifdef USER32
        mov     %ax,%fs
        mov     %ax,%gs
  #endif
  trap_set_segs:
        cld                             /* clear direction flag */
+#ifdef USER32
        testl   $(EFL_VM),R_EFLAGS(%rsp) /* in V86 mode? */
        jnz     trap_from_user          /* user mode trap if so */
+#endif
        /* Note: handling KERNEL_RING value by hand */
        testb   $2,R_CS(%rsp)           /* user mode trap? */
        jz      trap_from_kernel        /* kernel trap if not */
@@ -580,23 +549,18 @@ _return_to_user:
   */
_return_from_kernel:
-       addq    $16,%rsp                /* skip FS/GS base */
-#ifndef USER32
-_kret_popl_gs:
-_kret_popl_fs:
-       addq    $16,%rsp                /* skip FS/GS selector */
-#else
+#ifdef USER32
  _kret_popl_gs:
        popq    %gs                     /* restore segment registers */
  _kret_popl_fs:
        popq    %fs
-#endif
  _kret_popl_es:
        popq    %rax
        movq    %rax,%es
  _kret_popl_ds:
        popq    %rax
        movq    %rax,%ds
+#endif
        popa                            /* restore general registers */
        addq    $16,%rsp                /* discard trap number and error code */
  _kret_iret:
@@ -742,16 +706,17 @@ ENTRY(all_intrs)
        cmpq    %ss:EXT(int_stack_base),%rdx
        je      int_from_intstack       /* if not: */
+#ifdef USER32
        movq    %ds,%rdx                        /* save segment registers */
        pushq   %rdx
        movq    %es,%rdx
        pushq   %rdx
-       PUSH_FSGS_ISR
+       pushq   %fs
+       pushq   %gs
   >         mov     %ss,%dx                 /* switch to kernel segments */
        mov     %dx,%ds
        mov     %dx,%es
-#ifdef USER32
        mov     %dx,%fs
        mov     %dx,%gs
  #endif
@@ -784,8 +749,10 @@ LEXT(return_to_iret)                       /* ( label for 
kdb_kintr and hardclock) */
popq %rsp /* switch back to old stack */ +#ifdef USER32
        testl   $(EFL_VM),I_EFL(%rsp)   /* if in V86 */
        jnz     0f                      /* or */
+#endif
        /* Note: handling KERNEL_RING value by hand */
        testb   $2,I_CS(%rsp)           /* user mode, */
        jz      1f                      /* check for ASTs */
@@ -793,11 +760,14 @@ LEXT(return_to_iret)                      /* ( label for 
kdb_kintr and hardclock) */
        cmpq    $0,CX(EXT(need_ast),%edx)
        jnz     ast_from_interrupt      /* take it if so */
  1:
-       POP_FSGS_ISR
+#ifdef USER32
+       popq    %gs
+       popq    %fs
        pop     %rdx
        mov     %rdx,%es
        pop     %rdx
        mov     %rdx,%ds
+#endif
        pop     %r11
        pop     %r10
        pop     %r9
@@ -847,11 +817,14 @@ stack_overflowed:
   *    ss
   */
  ast_from_interrupt:
-       POP_FSGS_ISR
+#ifdef USER32
+       popq    %gs
+       popq    %fs
        pop     %rdx
        mov     %rdx,%es
        pop     %rdx
        mov     %rdx,%ds
+#endif
        popq    %r11
        popq    %r10
        popq    %r9
@@ -864,16 +837,18 @@ ast_from_interrupt:
        pushq   $0                      /* zero code */
        pushq   $0                      /* zero trap number */
        pusha                           /* save general registers */
+
+#ifdef USER32
        mov     %ds,%rdx                /* save segment registers */
        push    %rdx
        mov     %es,%rdx
        push    %rdx
-       PUSH_FSGS_ISR
+       pushq   %fs
+       pushq   %gs
mov %ss,%dx /* switch to kernel segments */
        mov     %dx,%ds
        mov     %dx,%es
-#ifdef USER32
        mov     %dx,%fs
        mov     %dx,%gs
  #endif
@@ -994,20 +969,26 @@ kdb_from_iret_i:                  /* on interrupt stack */
        pushq   $0                      /* zero error code */
        pushq   $0                      /* zero trap number */
        pusha                           /* save general registers */
+#ifdef USER32
        mov     %ds,%rdx                /* save segment registers */
        push    %rdx
        mov     %es,%rdx
        push    %rdx
-       PUSH_FSGS
+       pushq   %fs
+       pushq   %gs
+#endif
        movq    %rsp,%rdx               /* pass regs, */
        movq    $0,%rsi                 /* code, */
        movq    $-1,%rdi                /* type to kdb */
        call    EXT(kdb_trap)
-       POP_FSGS
+#ifdef USER32
+       popq    %gs
+       popq    %fs
        pop     %rdx
        mov     %rdx,%es
        pop     %rdx
        mov     %rdx,%ds
+#endif
        popa                            /* restore general registers */
        addq    $16,%rsp
@@ -1082,23 +1063,27 @@ ttd_from_iret_i: /* on interrupt stack */
        pushq   $0                      /* zero error code */
        pushq   $0                      /* zero trap number */
        pusha                           /* save general registers */
+#ifdef USER32
        mov     %ds,%rdx                /* save segment registers */
        push    %rdx
        mov     %es,%rdx
        push    %rdx
        push    %fs
        push    %gs
+#endif
        ud2     // TEST it
        movq    %rsp,%rdx               /* pass regs, */
        movq    $0,%rsi                 /* code, */
        movq    $-1,%rdi                /* type to kdb */
        call    _kttd_trap
+#ifdef USER32
        pop     %gs                     /* restore segment registers */
        pop     %fs
        pop     %rdx
        mov     %rdx,%es
        pop     %rdx
        mov     %rdx,%ds
+#endif
        popa                            /* restore general registers */
        addq    $16,%rsp
@@ -1137,8 +1122,6 @@ syscall_entry_2:
        pushq   %rdx
        pushq   %fs
        pushq   %gs
-       pushq   $0      // gsbase
-       pushq   $0      // fsbase
mov %ss,%dx /* switch to kernel data segment */
        mov     %dx,%ds




reply via email to

[Prev in Thread] Current Thread [Next in Thread]