Commit a6030d7e authored by Reza Arbab, committed by David Gibson
Browse files

spapr: Add a new level of NUMA for GPUs



NUMA nodes corresponding to GPU memory currently have the same
affinity/distance as normal memory nodes. Add a third NUMA associativity
reference point enabling us to give GPU nodes more distance.

This is guest visible information, which shouldn't change under a
running guest across migration between different qemu versions, so make
the change effective only in new (pseries > 5.0) machine types.

Before, `numactl -H` output in a guest with 4 GPUs (nodes 2-5):

node distances:
node   0   1   2   3   4   5
  0:  10  40  40  40  40  40
  1:  40  10  40  40  40  40
  2:  40  40  10  40  40  40
  3:  40  40  40  10  40  40
  4:  40  40  40  40  10  40
  5:  40  40  40  40  40  10

After:

node distances:
node   0   1   2   3   4   5
  0:  10  40  80  80  80  80
  1:  40  10  80  80  80  80
  2:  80  80  10  80  80  80
  3:  80  80  80  10  80  80
  4:  80  80  80  80  10  80
  5:  80  80  80  80  80  10

These are the same distances as on the host, mirroring the change made
to host firmware in skiboot commit f845a648b8cb ("numa/associativity:
Add a new level of NUMA for GPU's").

Signed-off-by: Reza Arbab <arbab@linux.ibm.com>
Message-Id: <20200716225655.24289-1-arbab@linux.ibm.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
parent a4beb5f5
Loading
Loading
Loading
Loading
+19 −2
Original line number Diff line number Diff line
@@ -890,10 +890,16 @@ static int spapr_dt_rng(void *fdt)
static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt)
{
    MachineState *ms = MACHINE(spapr);
    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(ms);
    int rtas;
    GString *hypertas = g_string_sized_new(256);
    GString *qemu_hypertas = g_string_sized_new(256);
    uint32_t refpoints[] = { cpu_to_be32(0x4), cpu_to_be32(0x4) };
    uint32_t refpoints[] = {
        cpu_to_be32(0x4),
        cpu_to_be32(0x4),
        cpu_to_be32(0x2),
    };
    uint32_t nr_refpoints = ARRAY_SIZE(refpoints);
    uint64_t max_device_addr = MACHINE(spapr)->device_memory->base +
        memory_region_size(&MACHINE(spapr)->device_memory->mr);
    uint32_t lrdr_capacity[] = {
@@ -945,8 +951,12 @@ static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt)
                     qemu_hypertas->str, qemu_hypertas->len));
    g_string_free(qemu_hypertas, TRUE);

    if (smc->pre_5_1_assoc_refpoints) {
        nr_refpoints = 2;
    }

    _FDT(fdt_setprop(fdt, rtas, "ibm,associativity-reference-points",
                     refpoints, sizeof(refpoints)));
                     refpoints, nr_refpoints * sizeof(refpoints[0])));

    _FDT(fdt_setprop(fdt, rtas, "ibm,max-associativity-domains",
                     maxdomains, sizeof(maxdomains)));
@@ -4584,9 +4594,16 @@ DEFINE_SPAPR_MACHINE(5_1, "5.1", true);
 */
/*
 * Class options for the 5.0 machine type.
 *
 * Starts from the 5.1 class options and then layers the 5.0 compatibility
 * settings on top: the generic hw_compat_5_0 properties, the PHB
 * "pre-5.1-associativity" property (which makes the NVLink2 GPU memory
 * nodes keep SPAPR_GPU_NUMA_ID in their "ibm,associativity"), and the
 * pre_5_1_assoc_refpoints flag (which limits
 * "ibm,associativity-reference-points" to its first two entries).
 */
static void spapr_machine_5_0_class_options(MachineClass *mc)
{
    static GlobalProperty phb_compat[] = {
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "pre-5.1-associativity", "on" },
    };
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    /* Apply the newer machine's options first, then the 5.0 overrides. */
    spapr_machine_5_1_class_options(mc);

    compat_props_add(mc->compat_props, hw_compat_5_0, hw_compat_5_0_len);
    compat_props_add(mc->compat_props, phb_compat,
                     G_N_ELEMENTS(phb_compat));

    smc->pre_5_1_assoc_refpoints = true;
    mc->numa_mem_supported = true;
}

DEFINE_SPAPR_MACHINE(5_0, "5.0", false);
+2 −0
Original line number Diff line number Diff line
@@ -2089,6 +2089,8 @@ static Property spapr_phb_properties[] = {
                     pcie_ecs, true),
    DEFINE_PROP_UINT64("gpa", SpaprPhbState, nv2_gpa_win_addr, 0),
    DEFINE_PROP_UINT64("atsd", SpaprPhbState, nv2_atsd_win_addr, 0),
    DEFINE_PROP_BOOL("pre-5.1-associativity", SpaprPhbState,
                     pre_5_1_assoc, false),
    DEFINE_PROP_END_OF_LIST(),
};

+10 −3
Original line number Diff line number Diff line
@@ -362,9 +362,9 @@ void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt)
                                                    &error_abort);
        uint32_t associativity[] = {
            cpu_to_be32(0x4),
            SPAPR_GPU_NUMA_ID,
            SPAPR_GPU_NUMA_ID,
            SPAPR_GPU_NUMA_ID,
            cpu_to_be32(nvslot->numa_id),
            cpu_to_be32(nvslot->numa_id),
            cpu_to_be32(nvslot->numa_id),
            cpu_to_be32(nvslot->numa_id)
        };
        uint64_t size = object_property_get_uint(nv_mrobj, "size", NULL);
@@ -375,6 +375,13 @@ void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt)
        _FDT(off);
        _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
        _FDT((fdt_setprop(fdt, off, "reg", mem_reg, sizeof(mem_reg))));

        if (sphb->pre_5_1_assoc) {
            associativity[1] = SPAPR_GPU_NUMA_ID;
            associativity[2] = SPAPR_GPU_NUMA_ID;
            associativity[3] = SPAPR_GPU_NUMA_ID;
        }

        _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
                          sizeof(associativity))));

+1 −0
Original line number Diff line number Diff line
@@ -94,6 +94,7 @@ struct SpaprPhbState {
    hwaddr nv2_gpa_win_addr;
    hwaddr nv2_atsd_win_addr;
    SpaprPhbPciNvGpuConfig *nvgpus;
    bool pre_5_1_assoc;
};

#define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x80000000ULL
+1 −0
Original line number Diff line number Diff line
@@ -129,6 +129,7 @@ struct SpaprMachineClass {
    bool linux_pci_probe;
    bool smp_threads_vsmt; /* set VSMT to smp_threads by default */
    hwaddr rma_limit;          /* clamp the RMA to this size */
    bool pre_5_1_assoc_refpoints;

    void (*phb_placement)(SpaprMachineState *spapr, uint32_t index,
                          uint64_t *buid, hwaddr *pio,