Commit 333ec4ca authored by Peter Maydell's avatar Peter Maydell
Browse files

Merge remote-tracking branch 'remotes/jasowang/tags/net-pull-request' into staging



# gpg: Signature made Tue 27 Sep 2016 11:05:56 BST
# gpg:                using RSA key 0xEF04965B398D6211
# gpg: Good signature from "Jason Wang (Jason Wang on RedHat) <jasowang@redhat.com>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg:          It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 215D 46F4 8246 689E C77F  3562 EF04 965B 398D 6211

* remotes/jasowang/tags/net-pull-request: (27 commits)
  imx_fec: fix error in qemu_send_packet argument
  mcf_fec: fix error in qemu_send_packet argument
  net: mcf: limit buffer descriptor count
  e1000e: Fix EIAC register implementation
  e1000e: Fix spurious RX TCP ACK interrupts
  e1000e: Fix OTHER interrupts processing for MSI-X
  e1000e: Fix PBACLR implementation
  e1000e: Fix CTRL_EXT.EIAME behavior
  e1000e: Flush receive queues on link up
  e1000e: Flush all receive queues on receive enable
  net: limit allocation in nc_sendv_compat
  tap: Allow specifying a bridge
  e1000: fix buliding complaint
  docs: Add documentation for COLO-proxy
  MAINTAINERS: add maintainer for COLO-proxy
  filter-rewriter: rewrite tcp packet to keep secondary connection
  filter-rewriter: track connection and parse packet
  filter-rewriter: introduce filter-rewriter initialization
  colo-compare: add TCP, UDP, ICMP packet comparison
  colo-compare: introduce packet comparison thread
  ...

Signed-off-by: default avatarPeter Maydell <peter.maydell@linaro.org>
parents 7cfdc02d fa26f018
Loading
Loading
Loading
Loading
+9 −0
Original line number Diff line number Diff line
@@ -1364,6 +1364,15 @@ F: util/uuid.c
F: include/qemu/uuid.h
F: tests/test-uuid.c

COLO Proxy
M: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
M: Li Zhijian <lizhijian@cn.fujitsu.com>
S: Supported
F: docs/colo-proxy.txt
F: net/colo*
F: net/filter-rewriter.c
F: net/filter-mirror.c

Usermode Emulation
------------------
Overall

docs/colo-proxy.txt

0 → 100644
+188 −0
Original line number Diff line number Diff line
COLO-proxy
----------
Copyright (c) 2016 Intel Corporation
Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
Copyright (c) 2016 Fujitsu, Corp.

This work is licensed under the terms of the GNU GPL, version 2 or later.
See the COPYING file in the top-level directory.

This document gives an overview of COLO proxy's design.

== Background ==
COLO-proxy is a part of COLO project. It is used
to compare the network package to help COLO decide
whether to do checkpoint. With COLO-proxy's help,
COLO greatly improves the performance.

The filter-redirector, filter-mirror, colo-compare
and filter-rewriter compose the COLO-proxy.

== Architecture ==

COLO-Proxy is based on qemu netfilter and it's a plugin for qemu netfilter
(except colo-compare). It keep Secondary VM connect normally to
client and compare packets sent by PVM with sent by SVM.
If the packet difference, notify COLO-frame to do checkpoint and send
all primary packet has queued. Otherwise just send the queued primary
packet and drop the queued secondary packet.

Below is a COLO proxy ascii figure:

 Primary qemu                                                           Secondary qemu
+--------------------------------------------------------------+       +----------------------------------------------------------------+
| +----------------------------------------------------------+ |       |  +-----------------------------------------------------------+ |
| |                                                          | |       |  |                                                           | |
| |                        guest                             | |       |  |                        guest                              | |
| |                                                          | |       |  |                                                           | |
| +-------^--------------------------+-----------------------+ |       |  +---------------------+--------+----------------------------+ |
|         |                          |                         |       |                        ^        |                              |
|         |                          |                         |       |                        |        |                              |
|         |  +------------------------------------------------------+  |                        |        |                              |
|netfilter|  |                       |                         |    |  |   netfilter            |        |                              |
| +----------+ +----------------------------+                  |    |  |  +-----------------------------------------------------------+ |
| |       |  |                       |      |        out       |    |  |  |                     |        |  filter excute order       | |
| |       |  |          +-----------------------------+        |    |  |  |                     |        | +------------------->      | |
| |       |  |          |            |      |         |        |    |  |  |                     |        |   TCP                      | |
| | +-----+--+-+  +-----v----+ +-----v----+ |pri +----+----+sec|    |  |  | +------------+  +---+----+---v+rewriter++  +------------+ | |
| | |          |  |          | |          | |in  |         |in |    |  |  | |            |  |        |              |  |            | | |
| | |  filter  |  |  filter  | |  filter  +------>  colo   <------+ +-------->  filter   +--> adjust |   adjust     +-->   filter   | | |
| | |  mirror  |  |redirector| |redirector| |    | compare |   |  |    |  | | redirector |  | ack    |   seq        |  | redirector | | |
| | |          |  |          | |          | |    |         |   |  |    |  | |            |  |        |              |  |            | | |
| | +----^-----+  +----+-----+ +----------+ |    +---------+   |  |    |  | +------------+  +--------+--------------+  +---+--------+ | |
| |      |   tx        |   rx           rx  |                  |  |    |  |            tx                        all       |  rx      | |
| |      |             |                    |                  |  |    |  +-----------------------------------------------------------+ |
| |      |             +--------------+     |                  |  |    |                                                   |            |
| |      |   filter excute order      |     |                  |  |    |                                                   |            |
| |      |  +---------------->        |     |                  |  +--------------------------------------------------------+            |
| +-----------------------------------------+                  |       |                                                                |
|        |                            |                        |       |                                                                |
+--------------------------------------------------------------+       +----------------------------------------------------------------+
         |guest receive               | guest send
         |                            |
+--------+----------------------------v------------------------+
|                                                              |                          NOTE: filter direction is rx/tx/all
|                         tap                                  |                          rx:receive packets sent to the netdev
|                                                              |                          tx:receive packets sent by the netdev
+--------------------------------------------------------------+

1.Guest receive packet route:

Primary:

Tap --> Mirror Client Filter
Mirror client will send packet to guest,at the
same time, copy and forward packet to secondary
mirror server.

Secondary:

Mirror Server Filter --> TCP Rewriter
If receive packet is TCP packet,we will adjust ack
and update TCP checksum, then send to secondary
guest. Otherwise directly send to guest.

2.Guest send packet route:

Primary:

Guest --> Redirect Server Filter
Redirect server filter receive primary guest packet
but do nothing, just pass to next filter.

Redirect Server Filter --> COLO-Compare
COLO-compare receive primary guest packet then
waiting scondary redirect packet to compare it.
If packet same,send queued primary packet and clear
queued secondary packet, Otherwise send primary packet
and do checkpoint.

COLO-Compare --> Another Redirector Filter
The redirector get packet from colo-compare by use
chardev socket.

Redirector Filter --> Tap
Send the packet.

Secondary:

Guest --> TCP Rewriter Filter
If the packet is TCP packet,we will adjust seq
and update TCP checksum. Then send it to
redirect client filter. Otherwise directly send to
redirect client filter.

Redirect Client Filter --> Redirect Server Filter
Forward packet to primary.

== Components introduction ==

Filter-mirror is a netfilter plugin.
It gives qemu the ability to mirror
packets to a chardev.

Filter-redirector is a netfilter plugin.
It gives qemu the ability to redirect net packet.
Redirector can redirect filter's net packet to outdev,
and redirect indev's packet to filter.

                    filter
                      +
          redirector  |
             +--------------+
             |        |     |
             |        |     |
             |        |     |
  indev +---------+   +---------->  outdev
             |    |         |
             |    |         |
             |    |         |
             +--------------+
                  |
                  v
                filter

COLO-compare, we do packet comparing job.
Packets coming from the primary char indev will be sent to outdev.
Packets coming from the secondary char dev will be dropped after comparing.
COLO-comapre need two input chardev and one output chardev:
primary_in=chardev1-id (source: primary send packet)
secondary_in=chardev2-id (source: secondary send packet)
outdev=chardev3-id

Filter-rewriter will rewrite some of secondary packet to make
secondary guest's tcp connection established successfully.
In this module we will rewrite tcp packet's ack to the secondary
from primary,and rewrite tcp packet's seq to the primary from
secondary.

== Usage ==

Here, we use demo ip and port discribe more clearly.
Primary(ip:3.3.3.3):
-netdev tap,id=hn0,vhost=off,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown
-device e1000,id=e0,netdev=hn0,mac=52:a4:00:12:78:66
-chardev socket,id=mirror0,host=3.3.3.3,port=9003,server,nowait
-chardev socket,id=compare1,host=3.3.3.3,port=9004,server,nowait
-chardev socket,id=compare0,host=3.3.3.3,port=9001,server,nowait
-chardev socket,id=compare0-0,host=3.3.3.3,port=9001
-chardev socket,id=compare_out,host=3.3.3.3,port=9005,server,nowait
-chardev socket,id=compare_out0,host=3.3.3.3,port=9005
-object filter-mirror,id=m0,netdev=hn0,queue=tx,outdev=mirror0
-object filter-redirector,netdev=hn0,id=redire0,queue=rx,indev=compare_out
-object filter-redirector,netdev=hn0,id=redire1,queue=rx,outdev=compare0
-object colo-compare,id=comp0,primary_in=compare0-0,secondary_in=compare1,outdev=compare_out0

Secondary(ip:3.3.3.8):
-netdev tap,id=hn0,vhost=off,script=/etc/qemu-ifup,down script=/etc/qemu-ifdown
-device e1000,netdev=hn0,mac=52:a4:00:12:78:66
-chardev socket,id=red0,host=3.3.3.3,port=9003
-chardev socket,id=red1,host=3.3.3.3,port=9004
-object filter-redirector,id=f1,netdev=hn0,queue=tx,indev=red0
-object filter-redirector,id=f2,netdev=hn0,queue=rx,outdev=red1

Note:
  a.COLO-proxy must work with COLO-frame and Block-replication.
  b.Primary COLO must be started firstly, because COLO-proxy needs
    chardev socket server running before secondary started.
  c.Filter-rewriter only rewrite tcp packet.
+1 −1
Original line number Diff line number Diff line
@@ -400,7 +400,7 @@ static void e1000e_write_config(PCIDevice *pci_dev, uint32_t address,

    if (range_covers_byte(address, len, PCI_COMMAND) &&
        (pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
        qemu_flush_queued_packets(qemu_get_queue(s->nic));
        e1000e_start_recv(&s->core);
    }
}

+20 −12
Original line number Diff line number Diff line
@@ -953,7 +953,7 @@ e1000e_has_rxbufs(E1000ECore *core, const E1000E_RingInfo *r,
                         core->rx_desc_buf_size;
}

static inline void
void
e1000e_start_recv(E1000ECore *core)
{
    int i;
@@ -1710,7 +1710,8 @@ e1000e_receive_iov(E1000ECore *core, const struct iovec *iov, int iovcnt)
        }

        /* Perform ACK receive detection */
        if (e1000e_is_tcp_ack(core, core->rx_pkt)) {
        if  (!(core->mac[RFCTL] & E1000_RFCTL_ACK_DIS) &&
             (e1000e_is_tcp_ack(core, core->rx_pkt))) {
            n |= E1000_ICS_ACK;
        }

@@ -1807,6 +1808,7 @@ e1000e_core_set_link_status(E1000ECore *core)
                                   core->autoneg_timer);
        } else {
            e1000x_update_regs_on_link_up(core->mac, core->phy[0]);
            e1000e_start_recv(core);
        }
    }

@@ -2007,19 +2009,23 @@ e1000e_msix_notify_one(E1000ECore *core, uint32_t cause, uint32_t int_cfg)
    }

    if (core->mac[CTRL_EXT] & E1000_CTRL_EXT_EIAME) {
        trace_e1000e_irq_ims_clear_eiame(core->mac[IAM], cause);
        e1000e_clear_ims_bits(core, core->mac[IAM] & cause);
        trace_e1000e_irq_iam_clear_eiame(core->mac[IAM], cause);
        core->mac[IAM] &= ~cause;
    }

    trace_e1000e_irq_icr_clear_eiac(core->mac[ICR], core->mac[EIAC]);

    if (core->mac[EIAC] & E1000_ICR_OTHER) {
        effective_eiac = (core->mac[EIAC] & E1000_EIAC_MASK) |
                         E1000_ICR_OTHER_CAUSES;
    } else {
        effective_eiac = core->mac[EIAC] & E1000_EIAC_MASK;
    effective_eiac = core->mac[EIAC] & cause;

    if (effective_eiac == E1000_ICR_OTHER) {
        effective_eiac |= E1000_ICR_OTHER_CAUSES;
    }

    core->mac[ICR] &= ~effective_eiac;

    if (!(core->mac[CTRL_EXT] & E1000_CTRL_EXT_IAME)) {
        core->mac[IMS] &= ~effective_eiac;
    }
}

static void
@@ -2130,7 +2136,7 @@ e1000e_update_interrupt_state(E1000ECore *core)

    /* Set ICR[OTHER] for MSI-X */
    if (is_msix) {
        if (core->mac[ICR] & core->mac[IMS] & E1000_ICR_OTHER_CAUSES) {
        if (core->mac[ICR] & E1000_ICR_OTHER_CAUSES) {
            core->mac[ICR] |= E1000_ICR_OTHER;
            trace_e1000e_irq_add_msi_other(core->mac[ICR]);
        }
@@ -2168,7 +2174,7 @@ e1000e_update_interrupt_state(E1000ECore *core)
    }
}

static inline void
static void
e1000e_set_interrupt_cause(E1000ECore *core, uint32_t val)
{
    trace_e1000e_irq_set_cause_entry(val, core->mac[ICR]);
@@ -2187,6 +2193,8 @@ e1000e_autoneg_timer(void *opaque)
    E1000ECore *core = opaque;
    if (!qemu_get_queue(core->owner_nic)->link_down) {
        e1000x_update_regs_on_autoneg_done(core->mac, core->phy[0]);
        e1000e_start_recv(core);

        e1000e_update_flowctl_status(core);
        /* signal link status change to the guest */
        e1000e_set_interrupt_cause(core, E1000_ICR_LSC);
@@ -2344,7 +2352,7 @@ e1000e_set_pbaclr(E1000ECore *core, int index, uint32_t val)

    core->mac[PBACLR] = val & E1000_PBACLR_VALID_MASK;

    if (msix_enabled(core->owner)) {
    if (!msix_enabled(core->owner)) {
        return;
    }

+3 −0
Original line number Diff line number Diff line
@@ -144,3 +144,6 @@ e1000e_receive(E1000ECore *core, const uint8_t *buf, size_t size);

ssize_t
e1000e_receive_iov(E1000ECore *core, const struct iovec *iov, int iovcnt);

void
e1000e_start_recv(E1000ECore *core);
Loading