From: He Chen
Subject: [Qemu-devel] [RFC] x86: Allow to set NUMA distance for different NUMA nodes
Date: Fri, 3 Mar 2017 13:01:44 +0800

Currently, QEMU does not provide a clear way to set vNUMA distances for the
guest, although we already have the `-numa` option to set up vNUMA nodes.

vNUMA distance makes sense in certain scenarios.
Today, if we create a guest with 4 vNUMA nodes and check the NUMA info via
`numactl -H`, we will see:

node distance:
node    0    1    2    3
  0:   10   20   20   20
  1:   20   10   20   20
  2:   20   20   10   20
  3:   20   20   20   10

The guest kernel regards every local node as distance 10 and every remote node
as distance 20 when there is no SLIT table, and QEMU does not build one today
(the kernel's fallback is sketched after the next listing). This looks a little
strange once you have seen the distances on an actual physical machine that
contains 4 NUMA nodes. My machine shows:

node distance:
node    0    1    2    3
  0:   10   21   31   41
  1:   21   10   21   31
  2:   31   21   10   21
  3:   41   31   21   10
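
The flat 10/20 defaults in the first listing come from the guest kernel's
fallback when no SLIT is exposed. Roughly, Linux defines (quoted from
include/linux/topology.h for context only, not part of this patch):

```
/* Conform to ACPI 2.0 SLIT distance definitions */
#define LOCAL_DISTANCE      10
#define REMOTE_DISTANCE     20
/* without an arch override, node_distance() is just local-or-remote */
#define node_distance(from, to) \
        ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
```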

To set vNUMA distances, the guest should see a complete SLIT table.
QEMU already provides the `-acpitable` option, which allows users to pass an
ACPI table into the guest, but it requires users to build the ACPI table
themselves first. Using `-acpitable` to add a SLIT table is neither
straightforward nor flexible: whenever the vNUMA configuration changes, a new
SLIT table has to be generated manually. That is not friendly to users or to
upper-layer software like libvirt (a sketch of that manual route follows).
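
The manual route would look something like this, where the file names and
exact steps are only illustrative:

```
# Manual route this RFC wants to avoid (illustrative file names):
iasl slit.asl                        # hand-written ASL compiled to slit.aml
qemu-system-x86_64 ... -acpitable file=slit.aml
# Any change to the vNUMA layout means editing and recompiling slit.asl.
```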

This RFC patch adds SLIT table support to QEMU and provides an additional
`distance` option for the `-numa` command so that users can set vNUMA
distances on the QEMU command line.

With this patch, when a user wants to create a guest that contains
several vNUMA nodes and also wants to set the distances among those nodes,
the QEMU command line would look like:

```
-object memory-backend-ram,size=1G,prealloc=yes,host-nodes=0,policy=bind,id=node0 \
-numa node,nodeid=0,cpus=0,memdev=node0,distance=10,distance=21,distance=31,distance=41 \
-object memory-backend-ram,size=1G,prealloc=yes,host-nodes=1,policy=bind,id=node1 \
-numa node,nodeid=1,cpus=1,memdev=node1,distance=21,distance=10,distance=21,distance=31 \
-object memory-backend-ram,size=1G,prealloc=yes,host-nodes=2,policy=bind,id=node2 \
-numa node,nodeid=2,cpus=2,memdev=node2,distance=31,distance=21,distance=10,distance=21 \
-object memory-backend-ram,size=1G,prealloc=yes,host-nodes=3,policy=bind,id=node3 \
-numa node,nodeid=3,cpus=3,memdev=node3,distance=41,distance=31,distance=21,distance=10
```

As we can see, for each `-numa` command we provide a series of `distance`
options that together form a vNUMA distance array. E.g. for vNode0 the
distance array is [10, 21, 31, 41], which means the distance from vNode0 to
vNode0 is 10, to vNode1 is 21, and so on. For vNode1 the array is
[21, 10, 21, 31], which means vNode1 to vNode0 is 21, to vNode1 is 10, and so
on.
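
With the command line above, and assuming the guest kernel parses the
generated SLIT, `numactl -H` inside the guest is expected to report the same
matrix as the physical machine shown earlier:

node distance:
node    0    1    2    3
  0:   10   21   31   41
  1:   21   10   21   31
  2:   31   21   10   21
  3:   41   31   21   10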

BTW, please forgive me for not adding a prettier wrapper around the `distance`
option.

What do you think of this RFC patch? Shall we consider adding vNUMA
distance support to QEMU?

Thanks,
-He

Signed-off-by: He Chen <address@hidden>
---
 hw/i386/acpi-build.c        | 28 ++++++++++++++++++++++++++++
 include/hw/acpi/acpi-defs.h |  9 +++++++++
 include/sysemu/numa.h       |  2 ++
 numa.c                      | 44 ++++++++++++++++++++++++++++++++++++++++++++
 qapi-schema.json            | 12 ++++++++----
 5 files changed, 91 insertions(+), 4 deletions(-)

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 1c928ab..ee8236e 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2395,6 +2395,32 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
 }
 
 static void
+build_slit(GArray *table_data, BIOSLinker *linker, MachineState *machine)
+{
+    struct AcpiSystemLocalityDistanceTable *slit;
+    uint8_t *entry;
+    int slit_start, slit_data_len, i, j;
+    slit_start = table_data->len;
+
+    slit = acpi_data_push(table_data, sizeof(*slit));
+    slit->nb_localities = nb_numa_nodes;
+
+    slit_data_len = sizeof(uint8_t) * nb_numa_nodes * nb_numa_nodes;
+    entry = acpi_data_push(table_data, slit_data_len);
+
+    for (i = 0; i < nb_numa_nodes; i++) {
+        for (j = 0; j < nb_numa_nodes; j++) {
+            entry[i * nb_numa_nodes + j] = numa_info[i].distance[j];
+        }
+    }
+
+    build_header(linker, table_data,
+                 (void *)(table_data->data + slit_start),
+                 "SLIT",
+                 table_data->len - slit_start, 1, NULL, NULL);
+}
+
+static void
 build_mcfg_q35(GArray *table_data, BIOSLinker *linker, AcpiMcfgInfo *info)
 {
     AcpiTableMcfg *mcfg;
@@ -2669,6 +2695,8 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
     if (pcms->numa_nodes) {
         acpi_add_table(table_offsets, tables_blob);
         build_srat(tables_blob, tables->linker, machine);
+        acpi_add_table(table_offsets, tables_blob);
+        build_slit(tables_blob, tables->linker, machine);
     }
     if (acpi_get_mcfg(&mcfg)) {
         acpi_add_table(table_offsets, tables_blob);
diff --git a/include/hw/acpi/acpi-defs.h b/include/hw/acpi/acpi-defs.h
index 4cc3630..b183a8f 100644
--- a/include/hw/acpi/acpi-defs.h
+++ b/include/hw/acpi/acpi-defs.h
@@ -527,6 +527,15 @@ struct AcpiSratProcessorGiccAffinity
 
 typedef struct AcpiSratProcessorGiccAffinity AcpiSratProcessorGiccAffinity;
 
+/*
+ * SLIT (NUMA distance description) table
+ */
+struct AcpiSystemLocalityDistanceTable {
+    ACPI_TABLE_HEADER_DEF
+    uint64_t    nb_localities;
+} QEMU_PACKED;
+typedef struct AcpiSystemLocalityDistanceTable AcpiSystemLocalityDistanceTable;
+
 /* PCI fw r3.0 MCFG table. */
 /* Subtable */
 struct AcpiMcfgAllocation {
diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
index 8f09dcf..b936eeb 100644
--- a/include/sysemu/numa.h
+++ b/include/sysemu/numa.h
@@ -21,6 +21,8 @@ typedef struct node_info {
     struct HostMemoryBackend *node_memdev;
     bool present;
     QLIST_HEAD(, numa_addr_range) addr; /* List to store address ranges */
+    uint8_t distance[MAX_NODES];
+    bool has_distance;
 } NodeInfo;
 
 extern NodeInfo numa_info[MAX_NODES];
diff --git a/numa.c b/numa.c
index 9f56be9..b4e11f3 100644
--- a/numa.c
+++ b/numa.c
@@ -50,6 +50,8 @@ static int have_memdevs = -1;
 static int max_numa_nodeid; /* Highest specified NUMA node ID, plus one.
                              * For all nodes, nodeid < max_numa_nodeid
                              */
+static int min_numa_distance = 10;
+static int max_numa_distance = 255;
 int nb_numa_nodes;
 NodeInfo numa_info[MAX_NODES];
 
@@ -144,6 +146,8 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
 {
     uint16_t nodenr;
     uint16List *cpus = NULL;
+    uint8List *distance = NULL;
+    int i;
 
     if (node->has_nodeid) {
         nodenr = node->nodeid;
@@ -208,6 +212,28 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
         numa_info[nodenr].node_mem = object_property_get_int(o, "size", NULL);
         numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
     }
+
+    if (node->has_distance) {
+        for (i = 0, distance = node->distance; distance; i++, distance = distance->next) {
+            if (distance->value >= max_numa_distance) {
+                error_setg(errp,
+                        "NUMA distance (%" PRIu8 ")"
+                        " should be smaller than maxnumadistance (%d)",
+                        distance->value, max_numa_distance);
+                return;
+            }
+            if (distance->value < min_numa_distance) {
+                error_setg(errp,
+                        "NUMA distance (%" PRIu8 ")"
+                        " should be larger than minnumadistance (%d)",
+                        distance->value, min_numa_distance);
+                return;
+            }
+            numa_info[nodenr].distance[i] = distance->value;
+        }
+        numa_info[nodenr].has_distance = true;
+    }
+
     numa_info[nodenr].present = true;
     max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
 }
@@ -294,6 +320,23 @@ static void validate_numa_cpus(void)
     g_free(seen_cpus);
 }
 
+static void validate_numa_distance(void)
+{
+    int i, j;
+
+    for (i = 0; i < nb_numa_nodes; i++) {
+        for (j = i; j < nb_numa_nodes; j++) {
+            if (i == j && numa_info[i].distance[j] != min_numa_distance) {
+                error_report("Local distance must be %d!", min_numa_distance);
+                exit(EXIT_FAILURE);
+            } else if (numa_info[i].distance[j] != numa_info[j].distance[i]) {
+                error_report("Unequal NUMA distance between nodes %d and %d!", 
i, j);
+                exit(EXIT_FAILURE);
+            }
+        }
+    }
+}
+
 void parse_numa_opts(MachineClass *mc)
 {
     int i;
@@ -390,6 +433,7 @@ void parse_numa_opts(MachineClass *mc)
         }
 
         validate_numa_cpus();
+        validate_numa_distance();
     } else {
         numa_set_mem_node_id(0, ram_size, 0);
     }
diff --git a/qapi-schema.json b/qapi-schema.json
index baa0d26..f2f2d05 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -5597,14 +5597,18 @@
 # @memdev: #optional memory backend object.  If specified for one node,
 #          it must be specified for all nodes.
 #
+# @distance: #optional NUMA distance array. The length of this array should
+#            be equal to number of NUMA nodes.
+#
 # Since: 2.1
 ##
 { 'struct': 'NumaNodeOptions',
   'data': {
-   '*nodeid': 'uint16',
-   '*cpus':   ['uint16'],
-   '*mem':    'size',
-   '*memdev': 'str' }}
+   '*nodeid':   'uint16',
+   '*cpus':     ['uint16'],
+   '*mem':      'size',
+   '*memdev':   'str' ,
+   '*distance': ['uint8'] }}
 
 ##
 # @HostMemPolicy:
-- 
2.7.4



