Commit c8090972 authored by Peter Maydell
Browse files

Merge remote-tracking branch 'remotes/marcel/tags/rdma-pull-request' into staging



RDMA queue

# gpg: Signature made Sat 18 Aug 2018 16:01:46 BST
# gpg:                using RSA key 36D4C0F0CF2FE46D
# gpg: Good signature from "Marcel Apfelbaum <marcel.apfelbaum@zoho.com>"
# gpg:                 aka "Marcel Apfelbaum <marcel@redhat.com>"
# gpg:                 aka "Marcel Apfelbaum <marcel.apfelbaum@gmail.com>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg:          It is not certain that the signature belongs to the owner.
# Primary key fingerprint: B1C6 3A57 F92E 08F2 640F  31F5 36D4 C0F0 CF2F E46D

* remotes/marcel/tags/rdma-pull-request:
  config: split PVRDMA from RDMA
  hw/pvrdma: remove not needed include
  hw/rdma: Add reference to pci_dev in backend_dev
  hw/rdma: Bugfix - Support non-aligned buffers
  hw/rdma: Print backend QP number in hex format
  hw/rdma: Cosmetic change - move to generic function
  hw/pvrdma: Cosmetic change - indent right
  hw/rdma: Reorder resource cleanup
  hw/rdma: Do not allocate memory for non-dma MR
  hw/rdma: Delete useless structure RdmaRmUserMR
  hw/pvrdma: Make default pkey 0xFFFF
  hw/pvrdma: Clean CQE before use
  hw/rdma: Modify debug macros
  hw/pvrdma: Bugfix - provide the correct attr_mask to query_qp
  hw/rdma: Make distinction between device init and start modes

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
parents 62c34848 21ab34c9
Loading
Loading
Loading
Loading
+54 −1
Original line number Diff line number Diff line
@@ -375,6 +375,7 @@ hax="no"
hvf="no"
whpx="no"
rdma=""
pvrdma=""
gprof="no"
debug_tcg="no"
debug="no"
@@ -1363,6 +1364,10 @@ for opt do
  ;;
  --disable-rdma) rdma="no"
  ;;
  --enable-pvrdma) pvrdma="yes"
  ;;
  --disable-pvrdma) pvrdma="no"
  ;;
  --with-gtkabi=*) gtkabi="$optarg"
  ;;
  --disable-vte) vte="no"
@@ -1669,7 +1674,8 @@ disabled with --disable-FEATURE, default is enabled if available:
  hax             HAX acceleration support
  hvf             Hypervisor.framework acceleration support
  whpx            Windows Hypervisor Platform acceleration support
  rdma            Enable RDMA-based migration and PVRDMA support
  rdma            Enable RDMA-based migration
  pvrdma          Enable PVRDMA support
  vde             support for vde network
  netmap          support for netmap network
  linux-aio       Linux AIO support
@@ -3064,6 +3070,48 @@ EOF
  fi
fi

##########################################
# PVRDMA detection
#
# Resolves $pvrdma from its command-line state ("" = not specified,
# "yes" = --enable-pvrdma, "no" = --disable-pvrdma) to a final "yes"/"no".
# PVRDMA is only considered when $rdma is "yes". The probe program written
# to $TMPC calls mremap() with MREMAP_MAYMOVE | MREMAP_FIXED and is handed
# to compile_prog to decide whether the host can build it.

cat > $TMPC <<EOF &&
#include <sys/mman.h>

int
main(void)
{
    char buf = 0;
    void *addr = &buf;
    addr = mremap(addr, 0, 1, MREMAP_MAYMOVE | MREMAP_FIXED);

    return 0;
}
EOF

if test "$rdma" = "yes" ; then
    case "$pvrdma" in
    "")
        # Not requested either way: enable only if the probe builds.
        if compile_prog "" ""; then
            pvrdma="yes"
        else
            pvrdma="no"
        fi
        ;;
    "yes")
        # Explicitly requested: a failing probe is a hard configure error.
        if ! compile_prog "" ""; then
            error_exit "PVRDMA is not supported since mremap is not implemented"
        fi
        pvrdma="yes"
        ;;
    "no")
        pvrdma="no"
        ;;
    esac
else
    # Without RDMA, PVRDMA cannot be enabled; an explicit request is an error.
    if test "$pvrdma" = "yes" ; then
        error_exit "PVRDMA requires rdma support"
    fi
    pvrdma="no"
fi

##########################################
# VNC SASL detection
@@ -5952,6 +6000,7 @@ if test "$tcg" = "yes" ; then
fi
echo "malloc trim support $malloc_trim"
echo "RDMA support      $rdma"
echo "PVRDMA support    $pvrdma"
echo "fdt support       $fdt"
echo "membarrier        $membarrier"
echo "preadv support    $preadv"
@@ -6708,6 +6757,10 @@ if test "$rdma" = "yes" ; then
  echo "RDMA_LIBS=$rdma_libs" >> $config_host_mak
fi

if test "$pvrdma" = "yes" ; then
  echo "CONFIG_PVRDMA=y" >> $config_host_mak
fi

if test "$have_rtnetlink" = "yes" ; then
  echo "CONFIG_RTNETLINK=y" >> $config_host_mak
fi
+1 −1
Original line number Diff line number Diff line
ifeq ($(CONFIG_RDMA),y)
ifeq ($(CONFIG_PVRDMA),y)
obj-$(CONFIG_PCI) += rdma_utils.o rdma_backend.o rdma_rm.o
obj-$(CONFIG_PCI) += vmw/pvrdma_dev_ring.o vmw/pvrdma_cmd.o \
                     vmw/pvrdma_qp_ops.o vmw/pvrdma_main.o
+83 −22
Original line number Diff line number Diff line
@@ -35,6 +35,7 @@
#define VENDOR_ERR_MR_SMALL         0x208

#define THR_NAME_LEN 16
#define THR_POLL_TO  5000

typedef struct BackendCtx {
    uint64_t req_id;
@@ -91,10 +92,30 @@ static void *comp_handler_thread(void *arg)
    int rc;
    struct ibv_cq *ev_cq;
    void *ev_ctx;
    int flags;
    GPollFD pfds[1];

    /* Change to non-blocking mode */
    flags = fcntl(backend_dev->channel->fd, F_GETFL);
    rc = fcntl(backend_dev->channel->fd, F_SETFL, flags | O_NONBLOCK);
    if (rc < 0) {
        pr_dbg("Fail to change to non-blocking mode\n");
        return NULL;
    }

    pr_dbg("Starting\n");

    pfds[0].fd = backend_dev->channel->fd;
    pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;

    backend_dev->comp_thread.is_running = true;

    while (backend_dev->comp_thread.run) {
        do {
            rc = qemu_poll_ns(pfds, 1, THR_POLL_TO * (int64_t)SCALE_MS);
        } while (!rc && backend_dev->comp_thread.run);

        if (backend_dev->comp_thread.run) {
            pr_dbg("Waiting for completion on channel %p\n", backend_dev->channel);
            rc = ibv_get_cq_event(backend_dev->channel, &ev_cq, &ev_ctx);
            pr_dbg("ibv_get_cq_event=%d\n", rc);
@@ -112,14 +133,41 @@ static void *comp_handler_thread(void *arg)

            ibv_ack_cq_events(ev_cq, 1);
        }
    }

    pr_dbg("Going down\n");

    /* TODO: Post cqe for all remaining buffs that were posted */

    backend_dev->comp_thread.is_running = false;

    qemu_thread_exit(0);

    return NULL;
}

/*
 * Ask a backend thread to exit and wait until it reports that it has.
 *
 * Clears thread->run, then polls thread->is_running, sleeping for
 * THR_POLL_TO / SCALE_US / 2 between checks. Returns immediately when the
 * thread is not marked as running.
 */
static void stop_backend_thread(RdmaBackendThread *thread)
{
    thread->run = false;

    for (;;) {
        if (!thread->is_running) {
            break;
        }

        pr_dbg("Waiting for thread to complete\n");
        sleep(THR_POLL_TO / SCALE_US / 2);
    }
}

/*
 * (Re)launch the completion handler thread of backend_dev.
 *
 * A completion thread that is still marked as running is stopped first.
 * The new thread is named "rdma_comp_<ib device name>", limited to what
 * fits in THR_NAME_LEN bytes, and is created detached with
 * comp_handler_thread() as its entry point and backend_dev as its argument.
 */
static void start_comp_thread(RdmaBackendDev *backend_dev)
{
    RdmaBackendThread *comp = &backend_dev->comp_thread;
    char name[THR_NAME_LEN] = {0};

    stop_backend_thread(comp);

    snprintf(name, sizeof(name), "rdma_comp_%s",
             ibv_get_device_name(backend_dev->ib_dev));

    /* Must be set before the thread starts: its main loop tests this flag. */
    comp->run = true;
    qemu_thread_create(&comp->thread, name, comp_handler_thread, backend_dev,
                       QEMU_THREAD_DETACHED);
}

void rdma_backend_register_comp_handler(void (*handler)(int status,
                                        unsigned int vendor_err, void *ctx))
{
@@ -223,8 +271,7 @@ static int build_host_sge_array(RdmaDeviceResources *rdma_dev_res,
            return VENDOR_ERR_INVLKEY | ssge[ssge_idx].lkey;
        }

        dsge->addr = (uintptr_t)mr->user_mr.host_virt + ssge[ssge_idx].addr -
                     mr->user_mr.guest_start;
        dsge->addr = (uintptr_t)mr->virt + ssge[ssge_idx].addr - mr->start;
        dsge->length = ssge[ssge_idx].length;
        dsge->lkey = rdma_backend_mr_lkey(&mr->backend_mr);

@@ -697,7 +744,7 @@ static int init_device_caps(RdmaBackendDev *backend_dev,
    return 0;
}

int rdma_backend_init(RdmaBackendDev *backend_dev,
int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
                      RdmaDeviceResources *rdma_dev_res,
                      const char *backend_device_name, uint8_t port_num,
                      uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr,
@@ -706,10 +753,13 @@ int rdma_backend_init(RdmaBackendDev *backend_dev,
    int i;
    int ret = 0;
    int num_ibv_devices;
    char thread_name[THR_NAME_LEN] = {0};
    struct ibv_device **dev_list;
    struct ibv_port_attr port_attr;

    memset(backend_dev, 0, sizeof(*backend_dev));

    backend_dev->dev = pdev;

    backend_dev->backend_gid_idx = backend_gid_idx;
    backend_dev->port_num = port_num;
    backend_dev->rdma_dev_res = rdma_dev_res;
@@ -800,11 +850,8 @@ int rdma_backend_init(RdmaBackendDev *backend_dev,
    pr_dbg("interface_id=0x%" PRIx64 "\n",
           be64_to_cpu(backend_dev->gid.global.interface_id));

    snprintf(thread_name, sizeof(thread_name), "rdma_comp_%s",
             ibv_get_device_name(backend_dev->ib_dev));
    backend_dev->comp_thread.run = true;
    qemu_thread_create(&backend_dev->comp_thread.thread, thread_name,
                       comp_handler_thread, backend_dev, QEMU_THREAD_DETACHED);
    backend_dev->comp_thread.run = false;
    backend_dev->comp_thread.is_running = false;

    ah_cache_init();

@@ -823,8 +870,22 @@ out:
    return ret;
}


/*
 * Start the backend for backend_dev.
 *
 * Logs a debug message and hands off to start_comp_thread(), which launches
 * the completion handler thread.
 */
void rdma_backend_start(RdmaBackendDev *backend_dev)
{
    pr_dbg("Starting rdma_backend\n");
    start_comp_thread(backend_dev);
}

/*
 * Stop the backend for backend_dev.
 *
 * Logs a debug message, then asks the completion handler thread to exit and
 * waits for it via stop_backend_thread().
 */
void rdma_backend_stop(RdmaBackendDev *backend_dev)
{
    RdmaBackendThread *comp_thread = &backend_dev->comp_thread;

    pr_dbg("Stopping rdma_backend\n");
    stop_backend_thread(comp_thread);
}

void rdma_backend_fini(RdmaBackendDev *backend_dev)
{
    rdma_backend_stop(backend_dev);
    g_hash_table_destroy(ah_hash);
    ibv_destroy_comp_channel(backend_dev->channel);
    ibv_close_device(backend_dev->context);
+3 −1
Original line number Diff line number Diff line
@@ -46,12 +46,14 @@ static inline uint32_t rdma_backend_mr_rkey(const RdmaBackendMR *mr)
    return mr->ibmr ? mr->ibmr->rkey : 0;
}

int rdma_backend_init(RdmaBackendDev *backend_dev,
int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
                      RdmaDeviceResources *rdma_dev_res,
                      const char *backend_device_name, uint8_t port_num,
                      uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr,
                      Error **errp);
void rdma_backend_fini(RdmaBackendDev *backend_dev);
void rdma_backend_start(RdmaBackendDev *backend_dev);
void rdma_backend_stop(RdmaBackendDev *backend_dev);
void rdma_backend_register_comp_handler(void (*handler)(int status,
                                        unsigned int vendor_err, void *ctx));
void rdma_backend_unregister_comp_handler(void);
+2 −1
Original line number Diff line number Diff line
@@ -24,7 +24,8 @@ typedef struct RdmaDeviceResources RdmaDeviceResources;
/*
 * Bookkeeping for one backend worker thread: the thread handle, a mutex,
 * and the two flags used to request and observe thread shutdown.
 *
 * NOTE(review): `bool run;` is listed twice below (without and with its
 * comment). This looks like the removed/added pair of a diff rendered
 * without +/- markers rather than two members - confirm against the real
 * header.
 */
typedef struct RdmaBackendThread {
    QemuThread thread;
    QemuMutex mutex;
    bool run;
    bool run; /* Set by thread manager to let thread know it should exit */
    bool is_running; /* Set by the thread to report its status */
} RdmaBackendThread;

typedef struct RdmaBackendDev {
Loading