qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH 0/6] pc: bring ACPI table size below to 2.0 levels, try fixing -initrd for good


From: Paolo Bonzini
Subject: Re: [Qemu-devel] [PATCH 0/6] pc: bring ACPI table size below to 2.0 levels, try fixing -initrd for good
Date: Fri, 19 Sep 2014 15:09:12 +0200
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Thunderbird/31.0

Il 19/09/2014 09:36, Gerd Hoffmann ha scritto:
>   Hi,
> 
>> However, there is another problem.  As the ACPI tables grow, we need
>> to move the address at which linuxboot.bin loads the initrd.  This
>> address is placed close to the end of memory, but it is QEMU that
>> tells linuxboot.bin where exactly the initrd is to be loaded.  And
>> QEMU cannot really know how much high memory SeaBIOS will use, because
>> QEMU does not know the final e820 memory map.
>>
>> The solution would be to let linuxboot.bin parse the memory map and
>> ignore the suggested initrd base address, but that's tedious.  In the
>> meantime, we can just assume that most of the need comes from the ACPI
>> tables (which is in fact true: patch 3 adds a fixed 32k extra just in
>> case) and dynamically resize the padding.
> 
> Hmm.  That assumes we are running seabios, where we know how much memory
> we actually need.
> 
> IMHO we should either really parse the memory map, or reserve more
> space.
> 
> IIRC it doesn't matter that much where we load the initrd.  It should
> not be just after the kernel, because the kernel needs some space to
> unpack itself and for early allocations such as initial page tables.
> This is where the common practice to load the initrd high comes from.
> But whether we leave 128k or 16m between initrd and top-of-memory
> doesn't make much of a difference.

Ok, I wrote the e820 scanning code, and it works with KVM but it hits
a TCG bug.  The rep/movsb in SeaBIOS's e820 routine just doesn't write to
es:di.  The TCG ops seem sane:

 set_label $0x1
 ext16u_i64 tmp2,rsi
 ld_i64 tmp3,env,$0x108                  // load ds base
 add_i64 tmp2,tmp2,tmp3
 ext32u_i64 tmp2,tmp2
 qemu_ld_i64 tmp0,tmp2,ub,$0x2           // load into tmp0

 ext16u_i64 tmp2,rdi
 ld_i64 tmp3,env,$0xc0                   // load es base
 add_i64 tmp2,tmp2,tmp3
 ext32u_i64 tmp2,tmp2
 qemu_st_i64 tmp0,tmp2,ub,$0x2           // store from tmp0

 ld32s_i64 tmp0,env,$0xac                // increase rsi/rdi
 add_i64 tmp3,rsi,tmp0 
 deposit_i64 rsi,rsi,tmp3,$0x0,$0x10
 add_i64 tmp3,rdi,tmp0
 deposit_i64 rdi,rdi,tmp3,$0x0,$0x10

 movi_i64 tmp13,$0xffffffffffffffff      // decrement rcx
 add_i64 tmp3,rcx,tmp13
 deposit_i64 rcx,rcx,tmp3,$0x0,$0x10

 goto_tb $0x0
 movi_i64 tmp3,$0xf7b4
 st_i64 tmp3,env,$0x80
 exit_tb $0x7fe8a2c167a0
 set_label $0x0
 exit_tb $0x7fe8a2c167a3

For now I'm giving up; here is the patch just in case.  It also fails with
2.1.1.

There is some debugging output that goes to the serial port.  With KVM
it prints 1/2/2/1/2/2, while with TCG it prints 0/0/0/0/0 (it should
print 1/2/2/1/2 instead).


diff --git a/pc-bios/optionrom/linuxboot.S b/pc-bios/optionrom/linuxboot.S
index 748c831..e6f1be1 100644
--- a/pc-bios/optionrom/linuxboot.S
+++ b/pc-bios/optionrom/linuxboot.S
@@ -76,6 +76,96 @@ boot_kernel:
 
 
 copy_kernel:
+       push            %ds
+       pop             %es
+
+       /* Compute initrd address */
+       mov             $0xe801, %ax
+       xor             %cx, %cx
+       xor             %dx, %dx
+       int             $0x15
+
+       /* Output could be in AX/BX or CX/DX */
+       or              %cx, %cx
+       jnz             1f
+       or              %dx, %dx
+       jnz             1f
+       mov             %ax, %cx
+       mov             %bx, %dx
+1:
+
+       or              %dx, %dx
+       jnz             2f
+       addw            $1024, %cx            /* add 1 MB */
+       movzwl          %cx, %ebp
+       shll            $10, %ebp             /* convert to bytes */
+       jmp             mmap_loop_start
+
+2:
+       addw            $16777216 >> 16, %dx  /* add 16 MB */
+       movzwl          %dx, %ebp
+       shll            $16, %ebp             /* convert to bytes */
+
+       /* EBP (end of memory) is a hint to the loop below, that computes the
+          final location using the e820 memory map.  O(n^2) loop, but e820
+          is small anyway.  */
+
+mmap_loop_start:
+       movl            %ebp, %esi            /* ESI = end of memory */
+
+       read_fw         FW_CFG_INITRD_SIZE
+       subl            %eax, %ebp            /* EBP = start of initrd */
+       andl            $-4096, %ebp
+
+       xor             %ebx, %ebx
+
+       /* now move it further down according to the indications of the e820
+          memory map... */
+mmap_loop:
+       mov             $0xe820, %ax
+       mov             $0x534D4150, %edx
+       mov             $24, %ecx
+       mov             $e820, %edi
+       int             $0x15
+       jc              mmap_done             /* if at end of list, we're done 
*/
+       cmp             $0x534D4150, %eax     /* if BIOS broken, exit */
+       jnz             mmap_done
+       or              %ebx, %ebx            /* another check for end of list 
*/
+       jz              mmap_done
+
+mov 16(%di), %al
+mov $0x3f8, %dx
+add $0x30, %al
+out %al, %dx
+mov $0xd, %al
+out %al, %dx
+mov $0xa, %al
+out %al, %dx
+
+       jcxz            mmap_loop             /* ignore empty entries */
+       cmpb            $1, 16(%di)           /* only process reserved regions 
*/
+       je              mmap_loop
+       cmpl            $0, 4(%di)            /* only process low memory */
+       jne             mmap_loop
+       cmpl            %esi, 0(%di)
+       jae             mmap_loop
+
+       movl            8(%di), %ecx          /* ECX = region size */
+       jecxz           mmap_loop             /* ignore empty regions */
+
+       /* Valid low memory region.  Check if it overlaps EBP..ESI */
+
+       addl            0(%di), %ecx          /* ECX = end of region */
+       cmp             %ebp, %ecx            /* not if end <= initrd_start */
+       jbe             mmap_loop
+
+       /* Cannot put initrd here, try lowering the top of memory */
+
+       movl            0(%di), %ebp
+       jmp             mmap_loop_start
+
+mmap_done:
+       mov             %ebp, %edi            /* EDI = start of initrd */
 
        /* We need to load the kernel into memory we can't access in 16 bit
           mode, so let's get into 32 bit mode, write the kernel and jump
@@ -108,10 +198,18 @@ copy_kernel:
        /* We're now running in 16-bit CS, but 32-bit ES! */
 
        /* Load kernel and initrd */
+       pushl           %edi
+       read_fw_blob_addr32_edi(FW_CFG_INITRD)
        read_fw_blob_addr32(FW_CFG_KERNEL)
-       read_fw_blob_addr32(FW_CFG_INITRD)
        read_fw_blob_addr32(FW_CFG_CMDLINE)
-       read_fw_blob_addr32(FW_CFG_SETUP)
+
+       read_fw         FW_CFG_SETUP_ADDR
+       mov             %eax, %edi
+       mov             %eax, %ebx
+       read_fw_blob_addr32_edi(FW_CFG_SETUP)
+
+       /* Update the header with the initrd address we chose above */
+       popl            %es:0x218(%ebx)
 
        /* And now jump into Linux! */
        mov             $0, %eax
@@ -136,4 +234,9 @@ gdt:
        /* 0x10: data segment (base=0, limit=0xfffff, type=32bit data 
read/write, DPL=0, 4k) */
 .byte  0xff, 0xff, 0x00, 0x00, 0x00, 0x92, 0xcf, 0x00
 
+e820:
+.byte  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+.byte  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+.byte  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+
 BOOT_ROM_END
diff --git a/pc-bios/optionrom/optionrom.h b/pc-bios/optionrom/optionrom.h
index ce43608..f1a9021 100644
--- a/pc-bios/optionrom/optionrom.h
+++ b/pc-bios/optionrom/optionrom.h
@@ -51,8 +51,6 @@
 .endm
 
 #define read_fw_blob_pre(var)                          \
-       read_fw         var ## _ADDR;                   \
-       mov             %eax, %edi;                     \
        read_fw         var ## _SIZE;                   \
        mov             %eax, %ecx;                     \
        mov             $var ## _DATA, %ax;             \
@@ -68,6 +66,8 @@
  * Clobbers:   %eax, %edx, %es, %ecx, %edi
  */
 #define read_fw_blob(var)                              \
+       read_fw         var ## _ADDR;                   \
+       mov             %eax, %edi;                     \
        read_fw_blob_pre(var);                          \
        /* old as(1) doesn't like this insn so emit the bytes instead: \
        rep insb        (%dx), %es:(%edi);              \
@@ -80,7 +80,22 @@
  *
  * Clobbers:   %eax, %edx, %es, %ecx, %edi
  */
-#define read_fw_blob_addr32(var)                               \
+#define read_fw_blob_addr32(var)                       \
+       read_fw         var ## _ADDR;                   \
+       mov             %eax, %edi;                     \
+       read_fw_blob_pre(var);                          \
+       /* old as(1) doesn't like this insn so emit the bytes instead: \
+       addr32 rep insb (%dx), %es:(%edi);              \
+       */                                              \
+       .dc.b           0x67,0xf3,0x6c
+
+/*
+ * Read a blob from the fw_cfg device in forced addr32 mode, address is in 
%edi.
+ * Requires _SIZE and _DATA values for the parameter.
+ *
+ * Clobbers:   %eax, %edx, %edi, %es, %ecx
+ */
+#define read_fw_blob_addr32_edi(var)                   \
        read_fw_blob_pre(var);                          \
        /* old as(1) doesn't like this insn so emit the bytes instead: \
        addr32 rep insb (%dx), %es:(%edi);              \




reply via email to

[Prev in Thread] Current Thread [Next in Thread]