Commit 64ba5634 authored by Liu Jian
Browse files

net: add some bpf hooks in tcp stack for network numa relationship

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ


CVE: NA

--------------------------------

Add the sysctl net.core.numa_rship_ms to control the reporting frequency.
Add the bpf_sched_net_rship_submit bpf helper function to submit the
network relationship information to the scheduler subsystem.

Signed-off-by: Liu Jian <liujian56@huawei.com>
parent e79b3265
Loading
Loading
Loading
Loading
+19 −4
Original line number Diff line number Diff line
@@ -1477,11 +1477,21 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
#ifdef CONFIG_BPF_NET_GLOBAL_PROG
struct bpf_gnet_ctx_kern {
	struct sock *sk;
	int curr_tid;
	int peer_tid;
	int numa_node;
	__u64 rxtx_bytes;
	int rx_dev_idx;
	int rx_dev_queue_idx;
	__u64 rx_dev_netns_cookie;
};

enum gnet_bpf_attach_type {
	GNET_BPF_ATTACH_TYPE_INVALID = -1,
	GNET_RESERVE0 = 0,
	GNET_TCP_RECVMSG = 0,
	GNET_SK_DST_SET,
	GNET_RCV_NIC_NODE,
	GNET_SEND_NIC_NODE,
	MAX_GNET_BPF_ATTACH_TYPE
};

@@ -1489,9 +1499,14 @@ static inline enum gnet_bpf_attach_type
to_gnet_bpf_attach_type(enum bpf_attach_type attach_type)
{
	switch (attach_type) {
	GNET_ATYPE(GNET_RESERVE0);
	case BPF_GNET_RESERVE0:
		return GNET_RESERVE0;
	case BPF_GNET_TCP_RECVMSG:
		return GNET_TCP_RECVMSG;
	case BPF_GNET_SK_DST_SET:
		return GNET_SK_DST_SET;
	case BPF_GNET_RCV_NIC_NODE:
		return GNET_RCV_NIC_NODE;
	case BPF_GNET_SEND_NIC_NODE:
		return GNET_SEND_NIC_NODE;
	default:
	return GNET_BPF_ATTACH_TYPE_INVALID;
	}
+4 −0
Original line number Diff line number Diff line
@@ -924,7 +924,11 @@ struct sk_buff {
	/* public: */

	KABI_USE2(1, __u8 scm_io_uring:1, __u8 local_skb:1)
#if IS_ENABLED(CONFIG_SCHED_TASK_RELATIONSHIP)
	KABI_USE(2, struct sched_net_rship_skb *net_rship)
#else
	KABI_RESERVE(2)
#endif
	KABI_RESERVE(3)
	KABI_RESERVE(4)

+329 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Common code for task relationship aware
 *
 * Copyright (C) 2024 Huawei Technologies Co., Ltd
 *
 */

#ifndef __LINUX_NET_RSHIP_H__
#define __LINUX_NET_RSHIP_H__

#include <linux/types.h>
#include <linux/jiffies.h>
#include <linux/socket.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/net.h>
#include <linux/interrupt.h>
#include <linux/static_key.h>

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/filter.h>

#ifdef CONFIG_SCHED_TASK_RELATIONSHIP

/*
 * Per-skb network relationship data, reached through skb->net_rship.
 * Storage lives behind the skb itself (see struct sk_buff_net_rship and
 * struct sk_buff_fclones_net_rship).
 */
struct sched_net_rship_skb {
	/* for loopback traffic: tid of the task that sent/allocated the skb */
	pid_t alloc_tid;

	/* for phy nic */
	union {
		u32 rx_dev_idx; /* rx: ifindex of the receiving device */
		int dev_numa_node; /* tx: NUMA node of the tx device */
	};
	u16 alloc_cpu;	/* cpu the skb was initialized on (raw_smp_processor_id) */
	/*
	 * NOTE(review): rx_queue_idx is never written in this header
	 * (rx queue is read via skb_get_rx_queue() instead) — confirm
	 * whether a writer exists elsewhere or the field is unused.
	 */
	u16 rx_queue_idx;
	u64 rx_dev_net_cookie;	/* netns cookie of the receiving device */
};

/*
 * Allocation wrappers reserving relationship storage directly behind the
 * skb object(s); net_rship_skb_init()/net_rship_skb_init_flags() point
 * skb->net_rship at the trailing ext members.
 */
struct sk_buff_fclones_net_rship {
	struct sk_buff_fclones fclones;
	struct sched_net_rship_skb ext1;	/* for fclones.skb1 */
	struct sched_net_rship_skb ext2;	/* for fclones.skb2 */
};

struct sk_buff_net_rship {
	struct sk_buff skb;
	struct sched_net_rship_skb ext;
};

/*
 * Per-socket network relationship state, reached through sk->net_rship.
 * The byte counters accumulate between rate-limited reports to the gnet
 * bpf programs; the last_*_update fields hold jiffies timestamps used by
 * net_rship_refresh_timeout().
 */
struct sched_net_rship_sock {
	/* for loopback traffic */
	pid_t sk_peer_tid;	/* tid of the last seen local sender */
	u64 tid_rx_bytes;	/* bytes received from sk_peer_tid since last report */
	unsigned long last_rx_update;

	/* for recv from phy nic */
	int rcv_numa_node;	/* init to NUMA_NO_NODE in net_rship_sock_init() */
	u64 rcv_numa_node_bytes;
	unsigned long last_rcv_numa_node_update;

	/* for send to phy nic */
	pid_t sk_send_tid;	/* tid of the task doing sendmsg on this socket */
	int send_numa_node;	/* init to NUMA_NO_NODE in net_rship_sock_init() */
	u64 send_numa_node_bytes;
	unsigned long last_send_numa_node_update;
};
#endif

#if defined(CONFIG_SCHED_TASK_RELATIONSHIP) && defined(CONFIG_BPF_NET_GLOBAL_PROG)

#define NET_RSHIP_HEAD_RESERVE	40
extern unsigned long net_numa_rship_jiffies;

static inline void net_rship_sock_init(struct sock *sk, unsigned int offset)
{
	sk->net_rship = (void *)(((char *)sk) + offset);
	memset(sk->net_rship, 0, sizeof(struct sched_net_rship_sock));
	sk->net_rship->rcv_numa_node = NUMA_NO_NODE;
	sk->net_rship->send_numa_node = NUMA_NO_NODE;
}

/* Return the per-skb relationship data attached at skb-init time. */
static inline struct sched_net_rship_skb *__get_skb_net_rship(struct sk_buff *skb)
{
	return skb->net_rship;
}

/*
 * Rate limiter: true once net_numa_rship_jiffies have elapsed since
 * @last_update (period controlled by sysctl net.core.numa_rship_ms).
 * time_after() keeps the comparison safe across jiffies wraparound.
 */
static inline bool net_rship_refresh_timeout(unsigned long last_update)
{
	return time_after(jiffies, net_numa_rship_jiffies + last_update);
}

/*
 * Route-set hook: when a destination with a NUMA-aware device is attached
 * to @sk from process context, report (tid, numa_node) to the
 * GNET_SK_DST_SET bpf program.
 */
static inline void net_rship_sk_dst_set(struct sock *sk, struct dst_entry *dst)
{
	int node;

	if (!gnet_bpf_enabled(GNET_SK_DST_SET))
		return;

	if (!in_task() || !dst)
		return;

	node = dev_to_node(&dst->dev->dev);
	if (node != NUMA_NO_NODE) {
		struct bpf_gnet_ctx_kern ctx = {0};

		ctx.sk = sk;
		ctx.numa_node = node;
		/* prefer the recorded sender tid over the current task */
		if (sk->net_rship->sk_send_tid)
			ctx.curr_tid = sk->net_rship->sk_send_tid;
		else
			ctx.curr_tid = task_pid_nr(current);
		run_gnet_bpf(GNET_SK_DST_SET, &ctx);
	}
}

/*
 * Flush the accumulated loopback rx byte count of @sk to the
 * GNET_TCP_RECVMSG bpf program and restart the accounting window.
 */
static inline void __net_rship_tcp_rcvmsg(struct sock *sk, pid_t tid)
{
	struct sched_net_rship_sock *rship = sk->net_rship;
	struct bpf_gnet_ctx_kern ctx = {
		.sk = sk,
		.curr_tid = task_pid_nr(current),
		.peer_tid = tid,
		.rxtx_bytes = rship->tid_rx_bytes,
	};

	rship->last_rx_update = jiffies;
	run_gnet_bpf(GNET_TCP_RECVMSG, &ctx);
	rship->tid_rx_bytes = 0;
}

/*
 * Account loopback tcp rx bytes against the sending task (ext->alloc_tid)
 * and report (curr_tid, peer_tid, bytes) to the GNET_TCP_RECVMSG program
 * immediately when the peer changes, or rate-limited otherwise.
 */
static inline void net_rship_tcp_local(struct sock *sk, struct sk_buff *skb)
{
	struct sched_net_rship_skb *ext;

	if (!gnet_bpf_enabled(GNET_TCP_RECVMSG))
		return;

	ext = __get_skb_net_rship(skb);
	/* alloc_tid == 0: skb was not tagged by a local sendmsg path */
	if (!ext->alloc_tid)
		return;

	if (sk->net_rship->sk_peer_tid != ext->alloc_tid) {
		/* new peer: restart the counter and report right away */
		sk->net_rship->sk_peer_tid = ext->alloc_tid;
		sk->net_rship->tid_rx_bytes = skb->len + NET_RSHIP_HEAD_RESERVE;
		__net_rship_tcp_rcvmsg(sk, ext->alloc_tid);
	} else {
		/* NET_RSHIP_HEAD_RESERVE: fixed per-skb overhead added to len */
		sk->net_rship->tid_rx_bytes += (skb->len + NET_RSHIP_HEAD_RESERVE);
		if (net_rship_refresh_timeout(sk->net_rship->last_rx_update))
			__net_rship_tcp_rcvmsg(sk, ext->alloc_tid);
	}
}

/*
 * Account rx bytes that arrived from a physical NIC and, rate-limited,
 * report the rx NUMA node plus device info to the GNET_RCV_NIC_NODE
 * bpf program.
 */
static inline void net_rship_recv_nic_node(struct sock *sk, struct sk_buff *skb)
{
	struct sched_net_rship_skb *ext;

	if (!gnet_bpf_enabled(GNET_RCV_NIC_NODE))
		return;

	ext = __get_skb_net_rship(skb);
	/*
	 * Skip locally generated skbs (alloc_tid set) and skbs with no rx
	 * device recorded: rx_dev_idx shares a union with dev_numa_node,
	 * which __net_rship_skb_clear() sets to NUMA_NO_NODE (-1), so -1
	 * means "no rx info".
	 */
	if (ext->alloc_tid || ext->rx_dev_idx == -1)
		return;

	sk->net_rship->rcv_numa_node_bytes += (skb->len + NET_RSHIP_HEAD_RESERVE);
	if (net_rship_refresh_timeout(sk->net_rship->last_rcv_numa_node_update)) {
		struct bpf_gnet_ctx_kern ctx = {0};

		ctx.sk = sk;
		ctx.curr_tid = task_pid_nr(current);
		/* node of the cpu the skb was initialized on — presumably the NIC rx cpu */
		ctx.numa_node = cpu_to_node(ext->alloc_cpu);
		ctx.rxtx_bytes = sk->net_rship->rcv_numa_node_bytes;
		ctx.rx_dev_idx = ext->rx_dev_idx;
		ctx.rx_dev_queue_idx = skb_get_rx_queue(skb);
		ctx.rx_dev_netns_cookie = ext->rx_dev_net_cookie;
		run_gnet_bpf(GNET_RCV_NIC_NODE, &ctx);
		sk->net_rship->last_rcv_numa_node_update = jiffies;
		sk->net_rship->rcv_numa_node_bytes = 0;
	}
}

/* tcp recvmsg hook: run both loopback and phy-nic rx accounting for @skb. */
static inline void net_rship_tcp_recvmsg(struct sock *sk, struct sk_buff *skb)
{
	net_rship_tcp_local(sk, skb);
	net_rship_recv_nic_node(sk, skb);
}

/*
 * Account tx bytes sent toward a NUMA-aware physical NIC and, rate
 * limited, report (sender tid, tx node, bytes) to the
 * GNET_SEND_NIC_NODE bpf program.
 */
static inline void net_rship_send_nic_node(struct sock *sk, struct sk_buff *skb)
{
	struct sched_net_rship_skb *ext;

	if (!gnet_bpf_enabled(GNET_SEND_NIC_NODE))
		return;

	ext = __get_skb_net_rship(skb);
	/* need a recorded tx device node and a known sending task */
	if (ext->dev_numa_node == NUMA_NO_NODE || !sk->net_rship->sk_send_tid)
		return;

	sk->net_rship->send_numa_node_bytes += skb->len;
	if (net_rship_refresh_timeout(sk->net_rship->last_send_numa_node_update)) {
		struct bpf_gnet_ctx_kern ctx = {0};

		ctx.sk = sk;
		ctx.curr_tid = sk->net_rship->sk_send_tid;
		ctx.rxtx_bytes = sk->net_rship->send_numa_node_bytes;
		ctx.numa_node = ext->dev_numa_node;

		run_gnet_bpf(GNET_SEND_NIC_NODE, &ctx);
		sk->net_rship->send_numa_node_bytes = 0;
		sk->net_rship->last_send_numa_node_update = jiffies;
	}
}

/* tx path: remember the NUMA node of the outgoing device in the skb ext. */
static inline void net_rship_skb_record_dev_numa_node(struct sk_buff *skb, struct net_device *dev)
{
	struct sched_net_rship_skb *ext;

	if (!gnet_bpf_enabled(GNET_SEND_NIC_NODE))
		return;

	ext = __get_skb_net_rship(skb);
	ext->dev_numa_node = dev_to_node(&dev->dev);
}

/* rx path: remember the receiving device's ifindex and netns cookie. */
static inline void net_rship_skb_record_dev_rxinfo(struct sk_buff *skb, struct net_device *dev)
{
	struct sched_net_rship_skb *ext;

	if (!gnet_bpf_enabled(GNET_RCV_NIC_NODE))
		return;

	ext = __get_skb_net_rship(skb);
	ext->rx_dev_idx = dev->ifindex;
	ext->rx_dev_net_cookie = dev_net(dev)->net_cookie;
}

/* Reset per-skb relationship data to the "nothing recorded" state. */
static inline void __net_rship_skb_clear(struct sched_net_rship_skb *ext)
{
	ext->alloc_tid = 0;
	/* clears both union members: dev_numa_node (tx) and rx_dev_idx (rx) */
	ext->dev_numa_node = NUMA_NO_NODE;
}

/* Reset @skb's relationship data (e.g. when the skb is recycled). */
static inline void net_rship_skb_clear(struct sk_buff *skb)
{
	__net_rship_skb_clear(__get_skb_net_rship(skb));
}

static inline void __net_rship_skb_init(struct sk_buff *skb)
{
	__net_rship_skb_clear(skb->net_rship);
	skb->net_rship->alloc_cpu = raw_smp_processor_id();
}

/*
 * Wire skb->net_rship to the ext area of the enclosing
 * struct sk_buff_net_rship allocation, then initialize it.
 */
static inline void net_rship_skb_init(struct sk_buff *skb)
{
	skb->net_rship = &((struct sk_buff_net_rship *)skb)->ext;
	__net_rship_skb_init(skb);
}

/*
 * Attach relationship storage for a freshly allocated skb.
 *
 * For fclone allocations, storage for both skb1 and skb2 lives in the
 * trailing ext1/ext2 of struct sk_buff_fclones_net_rship (which wraps
 * struct sk_buff_fclones); otherwise the single-skb wrapper layout of
 * net_rship_skb_init() is used.
 *
 * Fix: kernel coding style requires braces on both branches when one
 * branch needs them; the else branch was unbraced.
 */
static inline void net_rship_skb_init_flags(struct sk_buff *skb, int flags)
{
	if (flags & SKB_ALLOC_FCLONE) {
		struct sk_buff_fclones_net_rship *rskbs;

		/* @skb is skb1 of an fclone pair; recover the wrapper */
		rskbs = (void *)container_of(skb, struct sk_buff_fclones, skb1);
		skb->net_rship = &rskbs->ext1;
		rskbs->fclones.skb2.net_rship = &rskbs->ext2;

		__net_rship_skb_init(skb);
		__net_rship_skb_init(&rskbs->fclones.skb2);
	} else {
		net_rship_skb_init(skb);
	}
}

static inline void net_rship_skb_clone(struct sk_buff *n, struct sk_buff *skb)
{
	n->net_rship->alloc_tid = skb->net_rship->alloc_tid;
}

/* Make sure it is a process context */
static inline void net_rship_record_sendmsginfo(struct sk_buff *skb, struct sock *sk)
{
	if (gnet_bpf_enabled(GNET_TCP_RECVMSG) || gnet_bpf_enabled(GNET_RCV_NIC_NODE)) {
		struct sched_net_rship_skb *ext = __get_skb_net_rship(skb);

		ext->alloc_tid = task_pid_nr(current);
	}
	if (gnet_bpf_enabled(GNET_SK_DST_SET) || gnet_bpf_enabled(GNET_SEND_NIC_NODE))
		sk->net_rship->sk_send_tid = task_pid_nr(current);
}

#else

/*
 * CONFIG_SCHED_TASK_RELATIONSHIP and/or CONFIG_BPF_NET_GLOBAL_PROG
 * disabled: every hook becomes an empty static inline and compiles away.
 *
 * NOTE(review): net_rship_skb_record_rx_queue() exists only as a stub;
 * no enabled-branch implementation appears in this header — confirm it
 * is defined elsewhere or intentionally unimplemented.
 */

static inline void net_rship_sock_init(struct sock *sk, unsigned int offset)
{}

static inline void net_rship_sk_dst_set(struct sock *sk, struct dst_entry *dst)
{}

static inline void net_rship_tcp_recvmsg(struct sock *sk, struct sk_buff *skb)
{}

static inline void net_rship_send_nic_node(struct sock *sk, struct sk_buff *skb)
{}

static inline void net_rship_skb_record_rx_queue(struct sk_buff *skb, u16 rx_queue)
{}

static inline void net_rship_skb_record_dev_numa_node(struct sk_buff *skb, struct net_device *dev)
{}

static inline void net_rship_skb_record_dev_rxinfo(struct sk_buff *skb, struct net_device *dev)
{}

static inline void net_rship_skb_clear(struct sk_buff *skb)
{}

static inline void net_rship_skb_init(struct sk_buff *skb)
{}

static inline void net_rship_skb_init_flags(struct sk_buff *skb, int flags)
{}

static inline void net_rship_skb_clone(struct sk_buff *n, struct sk_buff *skb)
{}

static inline void net_rship_record_sendmsginfo(struct sk_buff *skb, struct sock *sk)
{}
#endif

#endif
+4 −0
Original line number Diff line number Diff line
@@ -533,7 +533,11 @@ struct sock {
#else
	KABI_RESERVE(1)
#endif
#if IS_ENABLED(CONFIG_SCHED_TASK_RELATIONSHIP)
	KABI_USE(2, struct sched_net_rship_sock *net_rship)
#else
	KABI_RESERVE(2)
#endif
	KABI_RESERVE(3)
	KABI_RESERVE(4)
	KABI_RESERVE(5)
+18 −1
Original line number Diff line number Diff line
@@ -246,7 +246,10 @@ enum bpf_attach_type {
	BPF_XDP,
#ifndef __GENKSYMS__
	BPF_SCHED,
	BPF_GNET_RESERVE0,
	BPF_GNET_TCP_RECVMSG,
	BPF_GNET_SK_DST_SET,
	BPF_GNET_RCV_NIC_NODE,
	BPF_GNET_SEND_NIC_NODE,
#endif
	__MAX_BPF_ATTACH_TYPE
};
@@ -3922,6 +3925,12 @@ union bpf_attr {
 *		get resource statistics of *nid* and store in *ctx*.
 *	Return
 *		0 on success, or a negative error in case of failure.
 *
 * int bpf_sched_net_rship_submit(void *buf, size_t sz, u64 flags)
 *	Description
 *		Submit the network relationship information to the scheduler subsystem.
 *	Return
 *		0 on success, or a negative error in case of failure.
 */
#define __BPF_FUNC_MAPPER(FN)		\
	FN(unspec),			\
@@ -4098,6 +4107,7 @@ union bpf_attr {
	FN(get_task_relationship_stats),\
	FN(sched_set_curr_preferred_node),\
	FN(get_node_stats),		\
	FN(sched_net_rship_submit),	\
	/* */

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -5254,6 +5264,13 @@ enum {

struct bpf_gnet_ctx {
	__bpf_md_ptr(struct bpf_sock *, sk);
	int curr_tid;
	int peer_tid;
	int numa_node;
	__u64 rxtx_bytes;
	int rx_dev_idx;
	int rx_dev_queue_idx;
	__u64 rx_dev_netns_cookie;
};

#endif /* _UAPI__LINUX_BPF_H__ */
Loading