[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH v5 10/14] pc: Add dimm paravirt SRAT info
From: |
Igor Mammedov |
Subject: |
Re: [Qemu-devel] [PATCH v5 10/14] pc: Add dimm paravirt SRAT info |
Date: |
Thu, 11 Jul 2013 07:13:39 +0200 |
On Wed, 10 Jul 2013 13:10:03 +0300
"Michael S. Tsirkin" <address@hidden> wrote:
> On Wed, Jun 26, 2013 at 05:13:33PM +0800, Hu Tao wrote:
> > The numa_fw_cfg paravirt interface is extended to include SRAT information
> > for
> > all hotplug-able dimms. There are 3 words for each hotplug-able memory slot,
> > denoting start address, size and node proximity. The new info is appended
> > after
> > existing numa info, so that the fw_cfg layout does not break. This
> > information
> > is used by Seabios to build hotplug memory device objects at runtime.
> > nb_numa_nodes is set to 1 by default (not 0), so that we always pass srat
> > info
> > to SeaBIOS.
> >
> > v3->v4: numa_fw_cfg needs to be initialized after memory controller sets up
> > dimm
> > ranges. Make changes for pc_piix and pc_q35 to set numa_fw_cfg after i440fx
> > initialization.
> >
> > v2->v3: setting nb_numa_nodes to 1 is not needed
> >
> > v1->v2:
> > Dimm SRAT info (#dimms) is appended at end of existing numa fw_cfg in order
> > not
> > to break existing layout
> > Documentation of the new fwcfg layout is included in docs/specs/fwcfg.txt
> >
> > Signed-off-by: Vasilis Liaskovitis <address@hidden>
> > Signed-off-by: Hu Tao <address@hidden>
>
> Please do not add any more fwcfg interfaces - generating
> ACPI in qemu removes the need for it.
>
> So please rebase on top of that work and generate the appropriate ACPI
> tables directly.
>
> You can find the latest code generating ACPI from qemu here:
> git://git.kernel.org/pub/scm/virt/kvm/mst/qemu.git acpi
Will it work with upstream SeaBIOS, or is a custom tree required for it as well?
>
> This code is work in progress, but once you base on
> top of that, I can put it on that branch and keep updating if
> interfaces change.
>
> > ---
> > docs/specs/fwcfg.txt | 28 ++++++++++++++++++++++++++++
> > hw/i386/pc.c | 30 ++++++++++++++++++++++++------
> > hw/i386/pc_piix.c | 1 +
> > hw/i386/pc_q35.c | 7 +++++--
> > include/hw/i386/pc.h | 1 +
> > include/sysemu/sysemu.h | 1 +
> > 6 files changed, 60 insertions(+), 8 deletions(-)
> > create mode 100644 docs/specs/fwcfg.txt
> >
> > diff --git a/docs/specs/fwcfg.txt b/docs/specs/fwcfg.txt
> > new file mode 100644
> > index 0000000..e6fcd8f
> > --- /dev/null
> > +++ b/docs/specs/fwcfg.txt
> > @@ -0,0 +1,28 @@
> > +QEMU<->BIOS Paravirt Documentation
> > +--------------------------------------
> > +
> > +This document describes paravirt data structures passed from QEMU to BIOS.
> > +
> > +fw_cfg SRAT paravirt info
> > +--------------------
> > +The SRAT info passed from QEMU to BIOS has the following layout:
> > +
> > +-----------------------------------------------------------------------------------------------
> > +#nodes | cpu0_pxm | cpu1_pxm | ... | cpulast_pxm | node0_mem | node1_mem |
> > ... | nodelast_mem
> > +
> > +-----------------------------------------------------------------------------------------------
> > +#dimms | dimm0_start | dimm0_sz | dimm0_pxm | ... | dimmlast_start |
> > dimmlast_sz | dimmlast_pxm
> > +
> > +Entry 0 contains the number of numa nodes (nb_numa_nodes).
> > +
> > +Entries 1..max_cpus: The next max_cpus entries describe node proximity for
> > each
> > +one of the vCPUs in the system.
> > +
> > +Entries max_cpus+1..max_cpus+nb_numa_nodes: The next nb_numa_nodes
> > entries
> > +describe the memory size for each one of the NUMA nodes in the system.
> > +
> > +Entry max_cpus+nb_numa_nodes+1 contains the number of memory dimms
> > (nb_hp_dimms)
> > +
> > +The last 3 * nb_hp_dimms entries are organized in triplets: Each triplet
> > contains
> > +the physical address offset, size (in bytes), and node proximity for the
> > +respective dimm.
> > diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> > index 65838a6..b51d3b5 100644
> > --- a/hw/i386/pc.c
> > +++ b/hw/i386/pc.c
> > @@ -55,6 +55,7 @@
> > #include "hw/acpi/acpi.h"
> > #include "hw/cpu/icc_bus.h"
> > #include "hw/boards.h"
> > +#include "hw/mem-hotplug/dimm.h"
> >
> > /* debug PC/ISA interrupts */
> > //#define DEBUG_IRQ
> > @@ -606,8 +607,6 @@ static FWCfgState *bochs_bios_init(void)
> > FWCfgState *fw_cfg;
> > uint8_t *smbios_table;
> > size_t smbios_len;
> > - uint64_t *numa_fw_cfg;
> > - int i, j;
> > unsigned int apic_id_limit = pc_apic_id_limit(max_cpus);
> >
> > fw_cfg = fw_cfg_init(BIOS_CFG_IOPORT, BIOS_CFG_IOPORT + 1, 0, 0);
> > @@ -640,11 +639,25 @@ static FWCfgState *bochs_bios_init(void)
> > &e820_table, sizeof(e820_table));
> >
> > fw_cfg_add_bytes(fw_cfg, FW_CFG_HPET, &hpet_cfg, sizeof(hpet_cfg));
> > +
> > + return fw_cfg;
> > +}
> > +
> > +void bochs_meminfo_bios_init(void *fw_cfg)
> > +{
> > + uint64_t *numa_fw_cfg;
> > + uint64_t *hp_dimms_fw_cfg;
> > + int i, j;
> > + unsigned int apic_id_limit = pc_apic_id_limit(max_cpus);
> > +
> > /* allocate memory for the NUMA channel: one (64bit) word for the
> > number
> > * of nodes, one word for each VCPU->node and one word for each node to
> > * hold the amount of memory.
> > + * Finally one word for the number of hotplug memory slots and three
> > words
> > + * for each hotplug memory slot (start address, size and node
> > proximity).
> > */
> > - numa_fw_cfg = g_new0(uint64_t, 1 + apic_id_limit + nb_numa_nodes);
> > + numa_fw_cfg = g_new0(uint64_t,
> > + 2 + apic_id_limit + nb_numa_nodes + 3 *
> > nb_hp_dimms);
> > numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes);
> > for (i = 0; i < max_cpus; i++) {
> > unsigned int apic_id = x86_cpu_apic_id_from_index(i);
> > @@ -659,11 +672,16 @@ static FWCfgState *bochs_bios_init(void)
> > for (i = 0; i < nb_numa_nodes; i++) {
> > numa_fw_cfg[apic_id_limit + 1 + i] = cpu_to_le64(node_mem[i]);
> > }
> > +
> > + numa_fw_cfg[1 + apic_id_limit + nb_numa_nodes] =
> > cpu_to_le64(nb_hp_dimms);
> > +
> > + hp_dimms_fw_cfg = numa_fw_cfg + 2 + apic_id_limit + nb_numa_nodes;
> > + if (nb_hp_dimms) {
> > + dimm_setup_fwcfg_layout(hp_dimms_fw_cfg);
> > + }
> > fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, numa_fw_cfg,
> > - (1 + apic_id_limit + nb_numa_nodes) *
> > + (2 + apic_id_limit + nb_numa_nodes + 3 * nb_hp_dimms)
> > *
> > sizeof(*numa_fw_cfg));
> > -
> > - return fw_cfg;
> > }
> >
> > static long get_file_size(FILE *f)
> > diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> > index fb056df..6e18343 100644
> > --- a/hw/i386/pc_piix.c
> > +++ b/hw/i386/pc_piix.c
> > @@ -138,6 +138,7 @@ static void pc_init1(MemoryRegion *system_memory,
> > if (!xen_enabled()) {
> > fw_cfg = pc_memory_init(kernel_filename, kernel_cmdline,
> > initrd_filename,
> > below_4g_mem_size, above_4g_mem_size);
> > + bochs_meminfo_bios_init(fw_cfg);
> > }
> >
> > if (kvm_irqchip_in_kernel()) {
> > diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> > index 5fe14bb..2c14977 100644
> > --- a/hw/i386/pc_q35.c
> > +++ b/hw/i386/pc_q35.c
> > @@ -74,6 +74,7 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
> > ICH9LPCState *ich9_lpc;
> > PCIDevice *ahci;
> > DeviceState *icc_bridge;
> > + void *fw_cfg = NULL;
> >
> > icc_bridge = qdev_create(NULL, TYPE_ICC_BRIDGE);
> > object_property_add_child(qdev_get_machine(), "icc-bridge",
> > @@ -97,8 +98,9 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
> >
> > /* allocate ram and load rom/bios */
> > if (!xen_enabled()) {
> > - pc_memory_init(kernel_filename, kernel_cmdline,
> > - initrd_filename, below_4g_mem_size,
> > above_4g_mem_size);
> > + fw_cfg = pc_memory_init(kernel_filename, kernel_cmdline,
> > + initrd_filename, below_4g_mem_size,
> > + above_4g_mem_size);
> > }
> >
> > /* irq lines */
> > @@ -116,6 +118,7 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
> > q35_host->mch.address_space_io = get_system_io();
> > /* pci */
> > qdev_init_nofail(DEVICE(q35_host));
> > + bochs_meminfo_bios_init(fw_cfg);
> > host_bus = q35_host->host.pci.bus;
> > /* create ISA bus */
> > lpc = pci_create_simple_multifunction(host_bus, PCI_DEVFN(ICH9_LPC_DEV,
> > diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> > index 959b92b..4a29e6e 100644
> > --- a/include/hw/i386/pc.h
> > +++ b/include/hw/i386/pc.h
> > @@ -231,6 +231,7 @@ int pvpanic_init(ISABus *bus);
> > #define E820_UNUSABLE 5
> >
> > int e820_add_entry(uint64_t, uint64_t, uint32_t);
> > +void bochs_meminfo_bios_init(void *fw_cfg);
> >
> > #define PC_COMPAT_1_5 \
> > {\
> > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > index 2fb71af..2644faa 100644
> > --- a/include/sysemu/sysemu.h
> > +++ b/include/sysemu/sysemu.h
> > @@ -132,6 +132,7 @@ extern QEMUClock *rtc_clock;
> > extern int nb_numa_nodes;
> > extern uint64_t node_mem[MAX_NODES];
> > extern unsigned long *node_cpumask[MAX_NODES];
> > +extern int nb_hp_dimms;
> >
> > #define MAX_OPTION_ROMS 16
> > typedef struct QEMUOptionRom {
> > --
> > 1.8.3.1
> >
>