Commit 257367c0 authored by Jakub Kicinski's avatar Jakub Kicinski
Browse files
Alexei Starovoitov says:

====================
pull-request: bpf-next 2022-01-06

We've added 41 non-merge commits during the last 2 day(s) which contain
a total of 36 files changed, 1214 insertions(+), 368 deletions(-).

The main changes are:

1) Various fixes in the verifier, from Kris and Daniel.

2) Fixes in sockmap, from John.

3) bpf_getsockopt fix, from Kuniyuki.

4) INET_POST_BIND fix, from Menglong.

5) arm64 JIT fix for bpf pseudo funcs, from Hou.

6) BPF ISA doc improvements, from Christoph.

* https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (41 commits)
  bpf: selftests: Add bind retry for post_bind{4, 6}
  bpf: selftests: Use C99 initializers in test_sock.c
  net: bpf: Handle return value of BPF_CGROUP_RUN_PROG_INET{4,6}_POST_BIND()
  bpf/selftests: Test bpf_d_path on rdonly_mem.
  libbpf: Add documentation for bpf_map batch operations
  selftests/bpf: Don't rely on preserving volatile in PT_REGS macros in loop3
  xdp: Add xdp_do_redirect_frame() for pre-computed xdp_frames
  xdp: Move conversion to xdp_frame out of map functions
  page_pool: Store the XDP mem id
  page_pool: Add callback to init pages when they are allocated
  xdp: Allow registering memory model without rxq reference
  samples/bpf: xdpsock: Add timestamp for Tx-only operation
  samples/bpf: xdpsock: Add time-out for cleaning Tx
  samples/bpf: xdpsock: Add sched policy and priority support
  samples/bpf: xdpsock: Add cyclic TX operation capability
  samples/bpf: xdpsock: Add clockid selection support
  samples/bpf: xdpsock: Add Dest and Src MAC setting for Tx-only operation
  samples/bpf: xdpsock: Add VLAN support for Tx-only operation
  libbpf 1.0: Deprecate bpf_object__find_map_by_offset() API
  libbpf 1.0: Deprecate bpf_map__is_offload_neutral()
  ...
====================

Link: https://lore.kernel.org/r/20220107013626.53943-1-alexei.starovoitov@gmail.com


Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents 710ad98c eff14fcd
Loading
Loading
Loading
Loading
+91 −65
Original line number Diff line number Diff line
@@ -19,23 +19,37 @@ The eBPF calling convention is defined as:
R0 - R5 are scratch registers and eBPF programs needs to spill/fill them if
necessary across calls.

Instruction encoding
====================

eBPF uses 64-bit instructions with the following encoding:

 =============  =======  ===============  ====================  ============
 32 bits (MSB)  16 bits  4 bits           4 bits                8 bits (LSB)
 =============  =======  ===============  ====================  ============
 immediate      offset   source register  destination register  opcode
 =============  =======  ===============  ====================  ============

Note that most instructions do not use all of the fields.
Unused fields shall be cleared to zero.

Instruction classes
===================
-------------------

The three LSB bits of the 'opcode' field store the instruction class:

  ========= =====
  class     value
  ========= =====
  BPF_LD    0x00
  BPF_LDX   0x01
  BPF_ST    0x02
  BPF_STX   0x03
  BPF_ALU   0x04
  BPF_JMP   0x05
  BPF_JMP32 0x06
  BPF_ALU64 0x07
  ========= =====
  =========  =====  ===============================
  class      value  description
  =========  =====  ===============================
  BPF_LD     0x00   non-standard load operations
  BPF_LDX    0x01   load into register operations
  BPF_ST     0x02   store from immediate operations
  BPF_STX    0x03   store from register operations
  BPF_ALU    0x04   32-bit arithmetic operations
  BPF_JMP    0x05   64-bit jump operations
  BPF_JMP32  0x06   32-bit jump operations
  BPF_ALU64  0x07   64-bit arithmetic operations
  =========  =====  ===============================

Arithmetic and jump instructions
================================
@@ -60,66 +74,78 @@ The 4th bit encodes the source operand:

The four MSB bits store the operation code.

For class BPF_ALU or BPF_ALU64:

  ========  =====  =========================
Arithmetic instructions
-----------------------

BPF_ALU uses 32-bit wide operands while BPF_ALU64 uses 64-bit wide operands for
otherwise identical operations.
The code field encodes the operation as below:

  ========  =====  ==========================
  code      value  description
  ========  =====  =========================
  BPF_ADD   0x00
  BPF_SUB   0x10
  BPF_MUL   0x20
  BPF_DIV   0x30
  BPF_OR    0x40
  BPF_AND   0x50
  BPF_LSH   0x60
  BPF_RSH   0x70
  BPF_NEG   0x80
  BPF_MOD   0x90
  BPF_XOR   0xa0
  BPF_MOV   0xb0   mov reg to reg
  ========  =====  ==========================
  BPF_ADD   0x00   dst += src
  BPF_SUB   0x10   dst -= src
  BPF_MUL   0x20   dst \*= src
  BPF_DIV   0x30   dst /= src
  BPF_OR    0x40   dst \|= src
  BPF_AND   0x50   dst &= src
  BPF_LSH   0x60   dst <<= src
  BPF_RSH   0x70   dst >>= src
  BPF_NEG   0x80   dst = ~src
  BPF_MOD   0x90   dst %= src
  BPF_XOR   0xa0   dst ^= src
  BPF_MOV   0xb0   dst = src
  BPF_ARSH  0xc0   sign extending shift right
  BPF_END   0xd0   endianness conversion
  ========  =====  =========================
  ========  =====  ==========================

For class BPF_JMP or BPF_JMP32:
BPF_ADD | BPF_X | BPF_ALU means::

  ========  =====  =========================
  code      value  description
  ========  =====  =========================
  BPF_JA    0x00   BPF_JMP only
  BPF_JEQ   0x10
  BPF_JGT   0x20
  BPF_JGE   0x30
  BPF_JSET  0x40
  BPF_JNE   0x50   jump '!='
  BPF_JSGT  0x60   signed '>'
  BPF_JSGE  0x70   signed '>='
  BPF_CALL  0x80   function call
  BPF_EXIT  0x90   function return
  BPF_JLT   0xa0   unsigned '<'
  BPF_JLE   0xb0   unsigned '<='
  BPF_JSLT  0xc0   signed '<'
  BPF_JSLE  0xd0   signed '<='
  ========  =====  =========================
  dst_reg = (u32) dst_reg + (u32) src_reg;

So BPF_ADD | BPF_X | BPF_ALU means::
BPF_ADD | BPF_X | BPF_ALU64 means::

  dst_reg = (u32) dst_reg + (u32) src_reg;
  dst_reg = dst_reg + src_reg

Similarly, BPF_XOR | BPF_K | BPF_ALU means::
BPF_XOR | BPF_K | BPF_ALU means::

  src_reg = (u32) src_reg ^ (u32) imm32

eBPF is using BPF_MOV | BPF_X | BPF_ALU to represent A = B moves.  BPF_ALU64
is used to mean exactly the same operations as BPF_ALU, but with 64-bit wide
operands instead. So BPF_ADD | BPF_X | BPF_ALU64 means 64-bit addition, i.e.::
BPF_XOR | BPF_K | BPF_ALU64 means::

  dst_reg = dst_reg + src_reg
  src_reg = src_reg ^ imm32


Jump instructions
-----------------

BPF_JMP32 uses 32-bit wide operands while BPF_JMP uses 64-bit wide operands for
otherwise identical operations.
The code field encodes the operation as below:

  ========  =====  =========================  ============
  code      value  description                notes
  ========  =====  =========================  ============
  BPF_JA    0x00   PC += off                  BPF_JMP only
  BPF_JEQ   0x10   PC += off if dst == src
  BPF_JGT   0x20   PC += off if dst > src     unsigned
  BPF_JGE   0x30   PC += off if dst >= src    unsigned
  BPF_JSET  0x40   PC += off if dst & src
  BPF_JNE   0x50   PC += off if dst != src
  BPF_JSGT  0x60   PC += off if dst > src     signed
  BPF_JSGE  0x70   PC += off if dst >= src    signed
  BPF_CALL  0x80   function call
  BPF_EXIT  0x90   function / program return  BPF_JMP only
  BPF_JLT   0xa0   PC += off if dst < src     unsigned
  BPF_JLE   0xb0   PC += off if dst <= src    unsigned
  BPF_JSLT  0xc0   PC += off if dst < src     signed
  BPF_JSLE  0xd0   PC += off if dst <= src    signed
  ========  =====  =========================  ============

BPF_JMP | BPF_EXIT means function exit only. The eBPF program needs to store
the return value into register R0 before doing a BPF_EXIT. Class 6 is used as
BPF_JMP32 to mean exactly the same operations as BPF_JMP, but with 32-bit wide
operands for the comparisons instead.
The eBPF program needs to store the return value into register R0 before doing a
BPF_EXIT.


Load and store instructions
@@ -147,15 +173,15 @@ The size modifier is one of:

The mode modifier is one of:

  =============  =====  =====================
  =============  =====  ====================================
  mode modifier  value  description
  =============  =====  =====================
  =============  =====  ====================================
  BPF_IMM        0x00   used for 64-bit mov
  BPF_ABS        0x20
  BPF_IND        0x40
  BPF_MEM        0x60
  BPF_ABS        0x20   legacy BPF packet access
  BPF_IND        0x40   legacy BPF packet access
  BPF_MEM        0x60   all normal load and store operations
  BPF_ATOMIC     0xc0   atomic operations
  =============  =====  =====================
  =============  =====  ====================================

BPF_MEM | <size> | BPF_STX means::

+4 −1
Original line number Diff line number Diff line
@@ -792,6 +792,9 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
		u64 imm64;

		imm64 = (u64)insn1.imm << 32 | (u32)imm;
		if (bpf_pseudo_func(insn))
			emit_addr_mov_i64(dst, imm64, ctx);
		else
			emit_a64_mov_i64(dst, imm64, ctx);

		return 1;
+10 −10
Original line number Diff line number Diff line
@@ -1669,17 +1669,17 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth);
struct btf *bpf_get_btf_vmlinux(void);

/* Map specifics */
struct xdp_buff;
struct xdp_frame;
struct sk_buff;
struct bpf_dtab_netdev;
struct bpf_cpu_map_entry;

void __dev_flush(void);
int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
		    struct net_device *dev_rx);
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
		    struct net_device *dev_rx);
int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
			  struct bpf_map *map, bool exclude_ingress);
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
			     struct bpf_prog *xdp_prog);
@@ -1688,7 +1688,7 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
			   bool exclude_ingress);

void __cpu_map_flush(void);
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
		    struct net_device *dev_rx);
int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu,
			     struct sk_buff *skb);
@@ -1866,26 +1866,26 @@ static inline void __dev_flush(void)
{
}

struct xdp_buff;
struct xdp_frame;
struct bpf_dtab_netdev;
struct bpf_cpu_map_entry;

static inline
int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
		    struct net_device *dev_rx)
{
	return 0;
}

static inline
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
		    struct net_device *dev_rx)
{
	return 0;
}

static inline
int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
			  struct bpf_map *map, bool exclude_ingress)
{
	return 0;
@@ -1913,7 +1913,7 @@ static inline void __cpu_map_flush(void)
}

static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
				  struct xdp_buff *xdp,
				  struct xdp_frame *xdpf,
				  struct net_device *dev_rx)
{
	return 0;
+4 −0
Original line number Diff line number Diff line
@@ -1019,6 +1019,10 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
int xdp_do_redirect(struct net_device *dev,
		    struct xdp_buff *xdp,
		    struct bpf_prog *prog);
int xdp_do_redirect_frame(struct net_device *dev,
			  struct xdp_buff *xdp,
			  struct xdp_frame *xdpf,
			  struct bpf_prog *prog);
void xdp_do_flush(void);

/* The xdp_do_flush_map() helper has been renamed to drop the _map suffix, as
+9 −2
Original line number Diff line number Diff line
@@ -80,6 +80,8 @@ struct page_pool_params {
	enum dma_data_direction dma_dir; /* DMA mapping direction */
	unsigned int	max_len; /* max DMA sync memory size */
	unsigned int	offset;  /* DMA addr offset */
	void (*init_callback)(struct page *page, void *arg);
	void *init_arg;
};

struct page_pool {
@@ -94,6 +96,7 @@ struct page_pool {
	unsigned int frag_offset;
	struct page *frag_page;
	long frag_users;
	u32 xdp_mem_id;

	/*
	 * Data structure for allocation side
@@ -168,9 +171,12 @@ bool page_pool_return_skb_page(struct page *page);

struct page_pool *page_pool_create(const struct page_pool_params *params);

struct xdp_mem_info;

#ifdef CONFIG_PAGE_POOL
void page_pool_destroy(struct page_pool *pool);
void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *));
void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
			   struct xdp_mem_info *mem);
void page_pool_release_page(struct page_pool *pool, struct page *page);
void page_pool_put_page_bulk(struct page_pool *pool, void **data,
			     int count);
@@ -180,7 +186,8 @@ static inline void page_pool_destroy(struct page_pool *pool)
}

static inline void page_pool_use_xdp_mem(struct page_pool *pool,
					 void (*disconnect)(void *))
					 void (*disconnect)(void *),
					 struct xdp_mem_info *mem)
{
}
static inline void page_pool_release_page(struct page_pool *pool,
Loading