Commit f1fdee33 authored by Andrii Nakryiko's avatar Andrii Nakryiko
Browse files

Merge branch 'sockmap fixes picked up by stress tests'



John Fastabend says:

====================

Running stress tests with recent patch to remove an extra lock in sockmap
resulted in a couple new issues popping up. It seems only one of them
is actually related to the patch:

799aa7f9 ("skmsg: Avoid lock_sock() in sk_psock_backlog()")

The other two issues had existed long before, but I guess the timing
with the serialization we had before was too tight to get any of
our tests or deployments to hit it.

With attached series stress testing sockmap+TCP with workloads that
create lots of short-lived connections no more splats like below were
seen on upstream bpf branch.

[224913.935822] WARNING: CPU: 3 PID: 32100 at net/core/stream.c:208 sk_stream_kill_queues+0x212/0x220
[224913.935841] Modules linked in: fuse overlay bpf_preload x86_pkg_temp_thermal intel_uncore wmi_bmof squashfs sch_fq_codel efivarfs ip_tables x_tables uas xhci_pci ixgbe mdio xfrm_algo xhci_hcd wmi
[224913.935897] CPU: 3 PID: 32100 Comm: fgs-bench Tainted: G          I       5.14.0-rc1alu+ #181
[224913.935908] Hardware name: Dell Inc. Precision 5820 Tower/002KVM, BIOS 1.9.2 01/24/2019
[224913.935914] RIP: 0010:sk_stream_kill_queues+0x212/0x220
[224913.935923] Code: 8b 83 20 02 00 00 85 c0 75 20 5b 5d 41 5c 41 5d 41 5e 41 5f c3 48 89 df e8 2b 11 fe ff eb c3 0f 0b e9 7c ff ff ff 0f 0b eb ce <0f> 0b 5b 5d 41 5c 41 5d 41 5e 41 5f c3 90 0f 1f 44 00 00 41 57 41
[224913.935932] RSP: 0018:ffff88816271fd38 EFLAGS: 00010206
[224913.935941] RAX: 0000000000000ae8 RBX: ffff88815acd5240 RCX: dffffc0000000000
[224913.935948] RDX: 0000000000000003 RSI: 0000000000000ae8 RDI: ffff88815acd5460
[224913.935954] RBP: ffff88815acd5460 R08: ffffffff955c0ae8 R09: fffffbfff2e6f543
[224913.935961] R10: ffffffff9737aa17 R11: fffffbfff2e6f542 R12: ffff88815acd5390
[224913.935967] R13: ffff88815acd5480 R14: ffffffff98d0c080 R15: ffffffff96267500
[224913.935974] FS:  00007f86e6bd1700(0000) GS:ffff888451cc0000(0000) knlGS:0000000000000000
[224913.935981] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[224913.935988] CR2: 000000c0008eb000 CR3: 00000001020e0005 CR4: 00000000003706e0
[224913.935994] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[224913.936000] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[224913.936007] Call Trace:
[224913.936016]  inet_csk_destroy_sock+0xba/0x1f0
[224913.936033]  __tcp_close+0x620/0x790
[224913.936047]  tcp_close+0x20/0x80
[224913.936056]  inet_release+0x8f/0xf0
[224913.936070]  __sock_release+0x72/0x120

v3: make sock_drop inline in skmsg.h
v2: init skb to null and fix a space/tab issue. Added Jakub's acks.
====================

Signed-off-by: default avatarAndrii Nakryiko <andrii@kernel.org>
parents d6371c76 9635720b
Loading
Loading
Loading
Loading
+35 −19
Original line number Diff line number Diff line
@@ -285,11 +285,45 @@ static inline struct sk_psock *sk_psock(const struct sock *sk)
	return rcu_dereference_sk_user_data(sk);
}

static inline void sk_psock_set_state(struct sk_psock *psock,
				      enum sk_psock_state_bits bit)
{
	set_bit(bit, &psock->state);
}

static inline void sk_psock_clear_state(struct sk_psock *psock,
					enum sk_psock_state_bits bit)
{
	clear_bit(bit, &psock->state);
}

static inline bool sk_psock_test_state(const struct sk_psock *psock,
				       enum sk_psock_state_bits bit)
{
	return test_bit(bit, &psock->state);
}

static inline void sock_drop(struct sock *sk, struct sk_buff *skb)
{
	sk_drops_add(sk, skb);
	kfree_skb(skb);
}

static inline void drop_sk_msg(struct sk_psock *psock, struct sk_msg *msg)
{
	if (msg->skb)
		sock_drop(psock->sk, msg->skb);
	kfree(msg);
}

static inline void sk_psock_queue_msg(struct sk_psock *psock,
				      struct sk_msg *msg)
{
	spin_lock_bh(&psock->ingress_lock);
	if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
		list_add_tail(&msg->list, &psock->ingress_msg);
	else
		drop_sk_msg(psock, msg);
	spin_unlock_bh(&psock->ingress_lock);
}

@@ -406,24 +440,6 @@ static inline void sk_psock_restore_proto(struct sock *sk,
		psock->psock_update_sk_prot(sk, psock, true);
}

static inline void sk_psock_set_state(struct sk_psock *psock,
				      enum sk_psock_state_bits bit)
{
	set_bit(bit, &psock->state);
}

static inline void sk_psock_clear_state(struct sk_psock *psock,
					enum sk_psock_state_bits bit)
{
	clear_bit(bit, &psock->state);
}

static inline bool sk_psock_test_state(const struct sk_psock *psock,
				       enum sk_psock_state_bits bit)
{
	return test_bit(bit, &psock->state);
}

static inline struct sk_psock *sk_psock_get(struct sock *sk)
{
	struct sk_psock *psock;
+28 −11
Original line number Diff line number Diff line
@@ -584,29 +584,42 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
	return sk_psock_skb_ingress(psock, skb);
}

static void sock_drop(struct sock *sk, struct sk_buff *skb)
static void sk_psock_skb_state(struct sk_psock *psock,
			       struct sk_psock_work_state *state,
			       struct sk_buff *skb,
			       int len, int off)
{
	sk_drops_add(sk, skb);
	kfree_skb(skb);
	spin_lock_bh(&psock->ingress_lock);
	if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
		state->skb = skb;
		state->len = len;
		state->off = off;
	} else {
		sock_drop(psock->sk, skb);
	}
	spin_unlock_bh(&psock->ingress_lock);
}

static void sk_psock_backlog(struct work_struct *work)
{
	struct sk_psock *psock = container_of(work, struct sk_psock, work);
	struct sk_psock_work_state *state = &psock->work_state;
	struct sk_buff *skb;
	struct sk_buff *skb = NULL;
	bool ingress;
	u32 len, off;
	int ret;

	mutex_lock(&psock->work_mutex);
	if (state->skb) {
	if (unlikely(state->skb)) {
		spin_lock_bh(&psock->ingress_lock);
		skb = state->skb;
		len = state->len;
		off = state->off;
		state->skb = NULL;
		goto start;
		spin_unlock_bh(&psock->ingress_lock);
	}
	if (skb)
		goto start;

	while ((skb = skb_dequeue(&psock->ingress_skb))) {
		len = skb->len;
@@ -621,9 +634,8 @@ static void sk_psock_backlog(struct work_struct *work)
							  len, ingress);
			if (ret <= 0) {
				if (ret == -EAGAIN) {
					state->skb = skb;
					state->len = len;
					state->off = off;
					sk_psock_skb_state(psock, state, skb,
							   len, off);
					goto end;
				}
				/* Hard errors break pipe and stop xmit. */
@@ -722,6 +734,11 @@ static void __sk_psock_zap_ingress(struct sk_psock *psock)
		skb_bpf_redirect_clear(skb);
		sock_drop(psock->sk, skb);
	}
	kfree_skb(psock->work_state.skb);
	/* We null the skb here to ensure that calls to sk_psock_backlog
	 * do not pick up the free'd skb.
	 */
	psock->work_state.skb = NULL;
	__sk_psock_purge_ingress_msg(psock);
}

@@ -773,8 +790,6 @@ static void sk_psock_destroy(struct work_struct *work)

void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
{
	sk_psock_stop(psock, false);

	write_lock_bh(&sk->sk_callback_lock);
	sk_psock_restore_proto(sk, psock);
	rcu_assign_sk_user_data(sk, NULL);
@@ -784,6 +799,8 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
		sk_psock_stop_verdict(sk, psock);
	write_unlock_bh(&sk->sk_callback_lock);

	sk_psock_stop(psock, false);

	INIT_RCU_WORK(&psock->rwork, sk_psock_destroy);
	queue_rcu_work(system_wq, &psock->rwork);
}