Commit 605ec166 authored by Yuval Shaia's avatar Yuval Shaia Committed by Marcel Apfelbaum
Browse files

hw/rdma: Add support for MAD packets

MAD (Management Datagram) packets are widely used by various modules,
both in the kernel and in user space. For example, the rdma_* API, which
is used to create and maintain a "connection" layer on top of RDMA, uses
several types of MAD packets.

For more information, please refer to chapter 13.4 of the InfiniBand
Architecture Specification, Volume 1, Release 1.1, available here:
https://www.infinibandta.org/ibta-specifications-download/



To support MAD packets the device uses an external utility
(contrib/rdmacm-mux) to relay packets from and to the guest driver.

Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
Reviewed-by: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
Signed-off-by: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
parent 305bdd7a
Loading
Loading
Loading
Loading
+243 −7
Original line number Diff line number Diff line
@@ -16,8 +16,13 @@
#include "qemu/osdep.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qmp/qlist.h"
#include "qapi/qmp/qnum.h"

#include <infiniband/verbs.h>
#include <infiniband/umad_types.h>
#include <infiniband/umad.h>
#include <rdma/rdma_user_cm.h>

#include "trace.h"
#include "rdma_utils.h"
@@ -33,16 +38,25 @@
#define VENDOR_ERR_MAD_SEND         0x206
#define VENDOR_ERR_INVLKEY          0x207
#define VENDOR_ERR_MR_SMALL         0x208
#define VENDOR_ERR_INV_MAD_BUFF     0x209
#define VENDOR_ERR_INV_NUM_SGE      0x210

#define THR_NAME_LEN 16
#define THR_POLL_TO  5000

/*
 * Size of the GRH that mad_read() writes at the start of a guest receive
 * buffer; the MAD payload is copied immediately after it.
 */
#define MAD_HDR_SIZE sizeof(struct ibv_grh)

/*
 * Per-request bookkeeping kept by the backend between the time a work
 * request is posted and the time its completion is reported.
 */
typedef struct BackendCtx {
    uint64_t req_id;
    void *up_ctx; /* Caller's context, handed back through comp_handler() */
    bool is_tx_req; /* NOTE(review): presumably set for send requests - confirm */
    struct ibv_sge sge; /* Used to save MAD recv buffer */
} BackendCtx;

/*
 * Message exchanged with the external MAD multiplexer over the MAD chardev:
 * a user-MAD header followed by a fixed-size payload area.  mad_send()
 * writes the whole structure and mad_read() interprets incoming bytes as
 * one, so the layout must not change.
 */
struct backend_umad {
    struct ib_user_mad hdr;           /* Addressing info and payload length */
    char mad[RDMA_MAX_PRIVATE_DATA];  /* MAD payload; hdr.length bytes used */
};

/*
 * Completion callback.  It is invoked with a work-completion status, a
 * vendor-specific error code (0 on success) and the context that was
 * supplied when the request was posted.
 */
static void (*comp_handler)(int status, unsigned int vendor_err, void *ctx);

static void dummy_comp_handler(int status, unsigned int vendor_err, void *ctx)
@@ -286,6 +300,61 @@ static int build_host_sge_array(RdmaDeviceResources *rdma_dev_res,
    return 0;
}

/*
 * Relay a MAD posted by the guest on QP1 to the external MAD multiplexer.
 *
 * The guest is expected to supply exactly two SGEs: sge[0] holds the MAD
 * header and sge[1] the MAD data.  Both are copied back-to-back into a
 * struct backend_umad which is then written to the MAD chardev.
 *
 * @backend_dev: backend device (provides the chardev, PCI device and GID)
 * @sge:         guest scatter/gather entries (guest DMA addresses)
 * @num_sge:     number of entries in @sge, must be 2
 *
 * Returns 0 on success, -EINVAL if @num_sge is not 2, -ENOMEM if the
 * message does not fit or a guest buffer cannot be mapped, and 1 if the
 * chardev did not accept the whole message.
 */
static int mad_send(RdmaBackendDev *backend_dev, struct ibv_sge *sge,
                    uint32_t num_sge)
{
    struct backend_umad umad = {0};
    char *hdr, *msg;
    int ret;

    pr_dbg("num_sge=%d\n", num_sge);

    if (num_sge != 2) {
        return -EINVAL;
    }

    /*
     * Both lengths come from the guest.  Check them individually instead
     * of checking their 32-bit sum: the sum can wrap around, pass the size
     * check and then overflow umad.mad in the copies below.
     */
    if (sge[0].length > sizeof(umad.mad) ||
        sge[1].length > sizeof(umad.mad) - sge[0].length) {
        pr_dbg("MAD too big (%u + %u)\n", sge[0].length, sge[1].length);
        return -ENOMEM;
    }

    umad.hdr.length = sge[0].length + sge[1].length;
    pr_dbg("msg_len=%d\n", umad.hdr.length);

    umad.hdr.addr.qpn = htobe32(1);
    umad.hdr.addr.grh_present = 1;
    umad.hdr.addr.gid_index = backend_dev->backend_gid_idx;
    memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, sizeof(umad.hdr.addr.gid));
    umad.hdr.addr.hop_limit = 0xFF;

    hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length);
    if (!hdr) {
        pr_dbg("Fail to map to sge[0]\n");
        return -ENOMEM;
    }
    msg = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length);
    if (!msg) {
        pr_dbg("Fail to map to sge[1]\n");
        rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);
        return -ENOMEM;
    }

    pr_dbg_buf("mad_hdr", hdr, sge[0].length);
    pr_dbg_buf("mad_data", msg, sge[1].length);

    memcpy(&umad.mad[0], hdr, sge[0].length);
    memcpy(&umad.mad[sge[0].length], msg, sge[1].length);

    /* Release the mappings in the reverse order of their creation */
    rdma_pci_dma_unmap(backend_dev->dev, msg, sge[1].length);
    rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);

    /* The multiplexer always receives the full fixed-size structure */
    ret = qemu_chr_fe_write(backend_dev->mad_chr_be, (const uint8_t *)&umad,
                            sizeof(umad));

    pr_dbg("qemu_chr_fe_write=%d\n", ret);

    return (ret != sizeof(umad));
}

void rdma_backend_post_send(RdmaBackendDev *backend_dev,
                            RdmaBackendQP *qp, uint8_t qp_type,
                            struct ibv_sge *sge, uint32_t num_sge,
@@ -304,9 +373,13 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
            comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
        } else if (qp_type == IBV_QPT_GSI) {
            pr_dbg("QP1\n");
            rc = mad_send(backend_dev, sge, num_sge);
            if (rc) {
                comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
            } else {
                comp_handler(IBV_WC_SUCCESS, 0, ctx);
            }
        }
        pr_dbg("qp->ibqp is NULL for qp_type %d!!!\n", qp_type);
        return;
    }

@@ -370,6 +443,48 @@ out_free_bctx:
    g_free(bctx);
}

/*
 * Remember a receive buffer that the guest posted on QP1 so that a later
 * incoming MAD can be delivered into it.
 *
 * The single SGE is stored in a freshly allocated BackendCtx, the context
 * is registered as a CQE context and its id is appended to the device's
 * list of pending MAD receive buffers.
 *
 * Returns 0 on success or a VENDOR_ERR_* code describing why the buffer
 * was rejected.
 */
static unsigned int save_mad_recv_buffer(RdmaBackendDev *backend_dev,
                                         struct ibv_sge *sge, uint32_t num_sge,
                                         void *ctx)
{
    /* Smallest buffer that can hold a GRH followed by a full MAD */
    const size_t min_len = RDMA_MAX_PRIVATE_DATA + sizeof(struct ibv_grh);
    BackendCtx *recv_ctx;
    uint32_t recv_ctx_id;

    if (num_sge != 1) {
        pr_dbg("Invalid num_sge (%d), expecting 1\n", num_sge);
        return VENDOR_ERR_INV_NUM_SGE;
    }

    if (sge->length < min_len) {
        pr_dbg("Too small buffer for MAD\n");
        return VENDOR_ERR_INV_MAD_BUFF;
    }

    pr_dbg("addr=0x%" PRIx64"\n", sge->addr);
    pr_dbg("length=%d\n", sge->length);
    pr_dbg("lkey=%d\n", sge->lkey);

    recv_ctx = g_malloc0(sizeof(*recv_ctx));

    if (unlikely(rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res,
                                       &recv_ctx_id, recv_ctx))) {
        g_free(recv_ctx);
        pr_dbg("Fail to allocate cqe_ctx\n");
        return VENDOR_ERR_NOMEM;
    }

    pr_dbg("bctx_id %d, bctx %p, ctx %p\n", recv_ctx_id, recv_ctx, ctx);
    recv_ctx->sge = sge[0];
    recv_ctx->up_ctx = ctx;

    /* Publish the buffer; mad_read() pops ids from this list */
    qemu_mutex_lock(&backend_dev->recv_mads_list.lock);
    qlist_append_int(backend_dev->recv_mads_list.list, recv_ctx_id);
    qemu_mutex_unlock(&backend_dev->recv_mads_list.lock);

    return 0;
}

void rdma_backend_post_recv(RdmaBackendDev *backend_dev,
                            RdmaDeviceResources *rdma_dev_res,
                            RdmaBackendQP *qp, uint8_t qp_type,
@@ -388,7 +503,10 @@ void rdma_backend_post_recv(RdmaBackendDev *backend_dev,
        }
        if (qp_type == IBV_QPT_GSI) {
            pr_dbg("QP1\n");
            comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
            rc = save_mad_recv_buffer(backend_dev, sge, num_sge, ctx);
            if (rc) {
                comp_handler(IBV_WC_GENERAL_ERR, rc, ctx);
            }
        }
        return;
    }
@@ -517,7 +635,6 @@ int rdma_backend_create_qp(RdmaBackendQP *qp, uint8_t qp_type,

    switch (qp_type) {
    case IBV_QPT_GSI:
        pr_dbg("QP1 unsupported\n");
        return 0;

    case IBV_QPT_RC:
@@ -748,11 +865,122 @@ static int init_device_caps(RdmaBackendDev *backend_dev,
    return 0;
}

/*
 * Fill in the GRH fields that precede a MAD handed to the guest.
 *
 * @grh:    header to fill (only paylen, sgid and dgid are written)
 * @sgid:   GID of the sender
 * @my_gid: GID of this device, stored as the destination GID
 * @paylen: payload length in host byte order; stored in network order
 */
static inline void build_mad_hdr(struct ibv_grh *grh, union ibv_gid *sgid,
                                 union ibv_gid *my_gid, int paylen)
{
    /* Addressing first, then the payload length */
    grh->dgid = *my_gid;
    grh->sgid = *sgid;
    grh->paylen = htons(paylen);

    pr_dbg("paylen=%d (net=0x%x)\n", paylen, grh->paylen);
    pr_dbg("my_gid=0x%llx\n", my_gid->global.interface_id);
    pr_dbg("gid=0x%llx\n", sgid->global.interface_id);
}

/*
 * Chardev "can read" callback: report how many bytes mad_read() is able to
 * take in one call, which is one complete struct backend_umad.  The opaque
 * pointer is not used.
 */
static inline int mad_can_receieve(void *opaque)
{
    const int max_msg_size = sizeof(struct backend_umad);

    return max_msg_size;
}

static void mad_read(void *opaque, const uint8_t *buf, int size)
{
    RdmaBackendDev *backend_dev = (RdmaBackendDev *)opaque;
    QObject *o_ctx_id;
    unsigned long cqe_ctx_id;
    BackendCtx *bctx;
    char *mad;
    struct backend_umad *umad;

    assert(size != sizeof(umad));
    umad = (struct backend_umad *)buf;

    pr_dbg("Got %d bytes\n", size);
    pr_dbg("umad->hdr.length=%d\n", umad->hdr.length);

#ifdef PVRDMA_DEBUG
    struct umad_hdr *hdr = (struct umad_hdr *)&msg->umad.mad;
    pr_dbg("bv %x cls %x cv %x mtd %x st %d tid %" PRIx64 " at %x atm %x\n",
           hdr->base_version, hdr->mgmt_class, hdr->class_version,
           hdr->method, hdr->status, be64toh(hdr->tid),
           hdr->attr_id, hdr->attr_mod);
#endif

    qemu_mutex_lock(&backend_dev->recv_mads_list.lock);
    o_ctx_id = qlist_pop(backend_dev->recv_mads_list.list);
    qemu_mutex_unlock(&backend_dev->recv_mads_list.lock);
    if (!o_ctx_id) {
        pr_dbg("No more free MADs buffers, waiting for a while\n");
        sleep(THR_POLL_TO);
        return;
    }

    cqe_ctx_id = qnum_get_uint(qobject_to(QNum, o_ctx_id));
    bctx = rdma_rm_get_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id);
    if (unlikely(!bctx)) {
        pr_dbg("Error: Fail to find ctx for %ld\n", cqe_ctx_id);
        return;
    }

    pr_dbg("id %ld, bctx %p, ctx %p\n", cqe_ctx_id, bctx, bctx->up_ctx);

    mad = rdma_pci_dma_map(backend_dev->dev, bctx->sge.addr,
                           bctx->sge.length);
    if (!mad || bctx->sge.length < umad->hdr.length + MAD_HDR_SIZE) {
        comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_INV_MAD_BUFF,
                     bctx->up_ctx);
    } else {
        memset(mad, 0, bctx->sge.length);
        build_mad_hdr((struct ibv_grh *)mad,
                      (union ibv_gid *)&umad->hdr.addr.gid,
                      &backend_dev->gid, umad->hdr.length);
        memcpy(&mad[MAD_HDR_SIZE], umad->mad, umad->hdr.length);
        rdma_pci_dma_unmap(backend_dev->dev, mad, bctx->sge.length);

        comp_handler(IBV_WC_SUCCESS, 0, bctx->up_ctx);
    }

    g_free(bctx);
    rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id);
}

/*
 * Set up MAD support: prepare the list of pending receive buffers, hook
 * the read callbacks onto the MAD chardev and register this device's GID
 * with the MAD multiplexer.
 *
 * Returns 0 on success or -EIO when no chardev backend is connected.  A
 * failed registration write is only logged.
 */
static int mad_init(RdmaBackendDev *backend_dev)
{
    struct backend_umad umad = {0};
    int ret;

    if (!qemu_chr_fe_backend_connected(backend_dev->mad_chr_be)) {
        pr_dbg("Missing chardev for MAD multiplexer\n");
        return -EIO;
    }

    /*
     * mad_read() takes this lock and pops from this list, so both must be
     * ready before the handlers are installed.
     */
    qemu_mutex_init(&backend_dev->recv_mads_list.lock);
    backend_dev->recv_mads_list.list = qlist_new();

    qemu_chr_fe_set_handlers(backend_dev->mad_chr_be, mad_can_receieve,
                             mad_read, NULL, NULL, backend_dev, NULL, true);

    /* Register ourself: only the header, carrying our GID, is sent */
    memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, sizeof(umad.hdr.addr.gid));
    ret = qemu_chr_fe_write(backend_dev->mad_chr_be, (const uint8_t *)&umad,
                            sizeof(umad.hdr));
    if (ret != sizeof(umad.hdr)) {
        pr_dbg("Fail to register to rdma_umadmux (%d)\n", ret);
    }

    return 0;
}

/*
 * Release the MAD receive bookkeeping created by mad_init(): first the
 * list of pending receive-buffer ids, then the mutex that protected it.
 */
static void mad_fini(RdmaBackendDev *backend_dev)
{
    qlist_destroy_obj(QOBJECT(backend_dev->recv_mads_list.list));
    qemu_mutex_destroy(&backend_dev->recv_mads_list.lock);
}

int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
                      RdmaDeviceResources *rdma_dev_res,
                      const char *backend_device_name, uint8_t port_num,
                      uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr,
                      Error **errp)
                      CharBackend *mad_chr_be, Error **errp)
{
    int i;
    int ret = 0;
@@ -763,7 +991,7 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
    memset(backend_dev, 0, sizeof(*backend_dev));

    backend_dev->dev = pdev;

    backend_dev->mad_chr_be = mad_chr_be;
    backend_dev->backend_gid_idx = backend_gid_idx;
    backend_dev->port_num = port_num;
    backend_dev->rdma_dev_res = rdma_dev_res;
@@ -854,6 +1082,13 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
    pr_dbg("interface_id=0x%" PRIx64 "\n",
           be64_to_cpu(backend_dev->gid.global.interface_id));

    ret = mad_init(backend_dev);
    if (ret) {
        error_setg(errp, "Fail to initialize mad");
        ret = -EIO;
        goto out_destroy_comm_channel;
    }

    backend_dev->comp_thread.run = false;
    backend_dev->comp_thread.is_running = false;

@@ -890,6 +1125,7 @@ void rdma_backend_stop(RdmaBackendDev *backend_dev)
void rdma_backend_fini(RdmaBackendDev *backend_dev)
{
    rdma_backend_stop(backend_dev);
    mad_fini(backend_dev);
    g_hash_table_destroy(ah_hash);
    ibv_destroy_comp_channel(backend_dev->channel);
    ibv_close_device(backend_dev->context);
+3 −1
Original line number Diff line number Diff line
@@ -17,6 +17,8 @@
#define RDMA_BACKEND_H

#include "qapi/error.h"
#include "chardev/char-fe.h"

#include "rdma_rm_defs.h"
#include "rdma_backend_defs.h"

@@ -50,7 +52,7 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
                      RdmaDeviceResources *rdma_dev_res,
                      const char *backend_device_name, uint8_t port_num,
                      uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr,
                      Error **errp);
                      CharBackend *mad_chr_be, Error **errp);
void rdma_backend_fini(RdmaBackendDev *backend_dev);
void rdma_backend_start(RdmaBackendDev *backend_dev);
void rdma_backend_stop(RdmaBackendDev *backend_dev);
+9 −1
Original line number Diff line number Diff line
@@ -16,8 +16,9 @@
#ifndef RDMA_BACKEND_DEFS_H
#define RDMA_BACKEND_DEFS_H

#include <infiniband/verbs.h>
#include "qemu/thread.h"
#include "chardev/char-fe.h"
#include <infiniband/verbs.h>

typedef struct RdmaDeviceResources RdmaDeviceResources;

@@ -28,6 +29,11 @@ typedef struct RdmaBackendThread {
    bool is_running; /* Set by the thread to report its status */
} RdmaBackendThread;

/*
 * Queue of receive buffers the guest has posted for incoming MADs.
 * Entries are CQE context ids; they are appended when a buffer is posted
 * and popped when a MAD arrives.
 */
typedef struct RecvMadList {
    QemuMutex lock; /* Held around every append to / pop from the list */
    QList *list;    /* CQE context ids, oldest first */
} RecvMadList;

typedef struct RdmaBackendDev {
    struct ibv_device_attr dev_attr;
    RdmaBackendThread comp_thread;
@@ -39,6 +45,8 @@ typedef struct RdmaBackendDev {
    struct ibv_comp_channel *channel;
    uint8_t port_num;
    uint8_t backend_gid_idx;
    RecvMadList recv_mads_list;
    CharBackend *mad_chr_be;
} RdmaBackendDev;

typedef struct RdmaBackendPD {
+2 −0
Original line number Diff line number Diff line
@@ -19,6 +19,7 @@
#include "qemu/units.h"
#include "hw/pci/pci.h"
#include "hw/pci/msix.h"
#include "chardev/char-fe.h"

#include "../rdma_backend_defs.h"
#include "../rdma_rm_defs.h"
@@ -83,6 +84,7 @@ typedef struct PVRDMADev {
    uint8_t backend_port_num;
    RdmaBackendDev backend_dev;
    RdmaDeviceResources rdma_dev_res;
    CharBackend mad_chr;
} PVRDMADev;
#define PVRDMA_DEV(dev) OBJECT_CHECK(PVRDMADev, (dev), PVRDMA_HW_NAME)

+3 −1
Original line number Diff line number Diff line
@@ -51,6 +51,7 @@ static Property pvrdma_dev_properties[] = {
    DEFINE_PROP_INT32("dev-caps-max-qp-init-rd-atom", PVRDMADev,
                      dev_attr.max_qp_init_rd_atom, MAX_QP_INIT_RD_ATOM),
    DEFINE_PROP_INT32("dev-caps-max-ah", PVRDMADev, dev_attr.max_ah, MAX_AH),
    DEFINE_PROP_CHR("mad-chardev", PVRDMADev, mad_chr),
    DEFINE_PROP_END_OF_LIST(),
};

@@ -613,7 +614,8 @@ static void pvrdma_realize(PCIDevice *pdev, Error **errp)

    rc = rdma_backend_init(&dev->backend_dev, pdev, &dev->rdma_dev_res,
                           dev->backend_device_name, dev->backend_port_num,
                           dev->backend_gid_idx, &dev->dev_attr, errp);
                           dev->backend_gid_idx, &dev->dev_attr, &dev->mad_chr,
                           errp);
    if (rc) {
        goto out;
    }