qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PATCH v2 for-2.3] numa: pc: fix default VCPU to node mappi


From: Igor Mammedov
Subject: [Qemu-devel] [PATCH v2 for-2.3] numa: pc: fix default VCPU to node mapping
Date: Wed, 18 Mar 2015 16:38:25 +0000

since commit
   dd0247e0 pc: acpi: mark all possible CPUs as enabled in SRAT
Linux kernel actually tries to use CPU to Node mapping from
QEMU provided SRAT table instead of discarding it, and that
in some cases breaks build_sched_domains() which expects
sane mapping where cores/threads belonging to the same socket
are on the same NUMA node.

With current default round-robin mapping of VCPUs to nodes
guest ends-up with cores/threads belonging to the same socket
being on different NUMA nodes.

For example with following CLI:
qemu-kvm -m 4G -smp 5,sockets=2,cores=4,threads=1,maxcpus=8 \
         -numa node,nodeid=0 -numa node,nodeid=1
2.6.32 based kernels will hang on boot due to incorrectly build
sched_group-s list in update_sd_lb_stats()
so comment in QEMU justifying dumb default mapping:
 "
  guest OSes must cope with this anyway, because there are BIOSes
  out there in real machines which also use this scheme.
 "
isn't really valid.

Replacing default mapping with a manual, where VCPUs belonging to
the same socket are on the same NUMA node, fixes issue for
guests which can't handle nonsense topology i.e. changing CLI to:
  -numa node,nodeid=0,cpus=0-3 -numa node,nodeid=1,cpus=4-7

So instead of simply scattering VCPUs around nodes, map
the same socket VCPUs to the same NUMA node, which is what
guest would expect from a sane hardware/BIOS.

Signed-off-by: Igor Mammedov <address@hidden>
---
v2:
  - add machine callback cpu_index_to_socket_id() and use it
    instead of stub approach
---
 hw/i386/pc.c          |  9 +++++++++
 include/hw/boards.h   |  5 +++++
 include/sysemu/numa.h |  3 ++-
 numa.c                | 18 +++++++++++++-----
 vl.c                  |  2 +-
 5 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 4b46c29..a52d2af 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1851,6 +1851,14 @@ static void pc_machine_initfn(Object *obj)
                              NULL, NULL);
 }
 
+static unsigned pc_cpu_index_to_socket_id(unsigned cpu_index)
+{
+    unsigned pkg_id, core_id, smt_id;
+    x86_topo_ids_from_idx(smp_cores, smp_threads, cpu_index,
+                          &pkg_id, &core_id, &smt_id);
+    return pkg_id;
+}
+
 static void pc_machine_class_init(ObjectClass *oc, void *data)
 {
     MachineClass *mc = MACHINE_CLASS(oc);
@@ -1859,6 +1867,7 @@ static void pc_machine_class_init(ObjectClass *oc, void 
*data)
 
     pcmc->get_hotplug_handler = mc->get_hotplug_handler;
     mc->get_hotplug_handler = pc_get_hotpug_handler;
+    mc->cpu_index_to_socket_id = pc_cpu_index_to_socket_id;
     hc->plug = pc_machine_device_plug_cb;
     hc->unplug_request = pc_machine_device_unplug_request_cb;
     hc->unplug = pc_machine_device_unplug_cb;
diff --git a/include/hw/boards.h b/include/hw/boards.h
index 1feea2b..78838d1 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -82,6 +82,10 @@ bool machine_mem_merge(MachineState *machine);
  *    of HotplugHandler object, which handles hotplug operation
  *    for a given @dev. It may return NULL if @dev doesn't require
  *    any actions to be performed by hotplug handler.
+ * @cpu_index_to_socket_id:
+ *    used to provide @cpu_index to socket number mapping, allowing
+ *    a machine to group CPU threads belonging to the same socket/package
+ *    Returns: socket number given cpu_index belongs to.
  */
 struct MachineClass {
     /*< private >*/
@@ -118,6 +122,7 @@ struct MachineClass {
 
     HotplugHandler *(*get_hotplug_handler)(MachineState *machine,
                                            DeviceState *dev);
+    unsigned (*cpu_index_to_socket_id)(unsigned cpu_index);
 };
 
 /**
diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
index 5633b85..6523b4d 100644
--- a/include/sysemu/numa.h
+++ b/include/sysemu/numa.h
@@ -6,6 +6,7 @@
 #include "qemu/option.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/hostmem.h"
+#include "hw/boards.h"
 
 extern int nb_numa_nodes;   /* Number of NUMA nodes */
 
@@ -16,7 +17,7 @@ typedef struct node_info {
     bool present;
 } NodeInfo;
 extern NodeInfo numa_info[MAX_NODES];
-void parse_numa_opts(void);
+void parse_numa_opts(MachineClass *mc);
 void numa_post_machine_init(void);
 void query_numa_node_mem(uint64_t node_mem[]);
 extern QemuOptsList qemu_numa_opts;
diff --git a/numa.c b/numa.c
index ffbec68..f1f571a 100644
--- a/numa.c
+++ b/numa.c
@@ -165,7 +165,7 @@ error:
     return -1;
 }
 
-void parse_numa_opts(void)
+void parse_numa_opts(MachineClass *mc)
 {
     int i;
 
@@ -233,13 +233,21 @@ void parse_numa_opts(void)
                 break;
             }
         }
-        /* assigning the VCPUs round-robin is easier to implement, guest OSes
-         * must cope with this anyway, because there are BIOSes out there in
-         * real machines which also use this scheme.
+        /* Historically VCPUs were assigned in round-robin order to NUMA
+         * nodes. However it causes issues with guest not handling it nice
+         * in case where cores/threads from a multicore CPU appear on
+         * different nodes. So allow boards to override default distribution
+         * rule grouping VCPUs by socket so that VCPUs from the same socket
+         * would be on the same node.
          */
         if (i == nb_numa_nodes) {
             for (i = 0; i < max_cpus; i++) {
-                set_bit(i, numa_info[i % nb_numa_nodes].node_cpu);
+                unsigned node_id = i % nb_numa_nodes;
+                if (mc->cpu_index_to_socket_id) {
+                    node_id = mc->cpu_index_to_socket_id(i) % nb_numa_nodes;
+                }
+
+                set_bit(i, numa_info[node_id].node_cpu);
             }
         }
     }
diff --git a/vl.c b/vl.c
index 694deb4..ab02de9 100644
--- a/vl.c
+++ b/vl.c
@@ -4168,7 +4168,7 @@ int main(int argc, char **argv, char **envp)
     default_drive(default_floppy, snapshot, IF_FLOPPY, 0, FD_OPTS);
     default_drive(default_sdcard, snapshot, IF_SD, 0, SD_OPTS);
 
-    parse_numa_opts();
+    parse_numa_opts(machine_class);
 
     if (qemu_opts_foreach(qemu_find_opts("mon"), mon_init_func, NULL, 1) != 0) 
{
         exit(1);
-- 
1.8.3.1




reply via email to

[Prev in Thread] Current Thread [Next in Thread]