Commit e10b02ee authored by Jakub Kicinski

Merge branch 'net-reduce-tcp_memory_allocated-inflation'

Eric Dumazet says:

====================
net: reduce tcp_memory_allocated inflation

Hosts with a lot of sockets tend to hit so-called TCP memory pressure,
leading to very bad TCP performance and/or OOM.

The problem is that some TCP sockets can hold up to 2MB of 'forward
allocations' in their per-socket cache (sk->sk_forward_alloc),
and there is no mechanism to make them relinquish their share
under memory pressure. Their share is reclaimed only on certain,
potentially rare, events, one socket at a time.

In this series, I implemented a per-cpu cache instead of a per-socket one.

Each CPU has a +1/-1 MB (256 pages on x86) forward-alloc cache, in order
not to dirty the shared tcp_memory_allocated cache line too often.

We keep sk->sk_forward_alloc values as small as possible to meet
the memcg page-granularity constraint.

Note that memcg already has a per-cpu cache, although MEMCG_CHARGE_BATCH
is defined as 32 pages, which seems a bit small.

Note that while this cover letter mentions TCP, this work is generic
and supports TCP, UDP, DECNET, SCTP.
====================

Link: https://lore.kernel.org/r/20220609063412.2205738-1-eric.dumazet@gmail.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 5c281b4e 0f2c2693
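To make the batching above concrete, here is a minimal user-space model of the scheme (illustrative only: RESERVE, mem_add() and the per-thread variable standing in for a real per-cpu counter are inventions of this sketch, not kernel code):

#include <stdatomic.h>
#include <stdio.h>

#define RESERVE (1 << 20)		/* 1 MB, mirroring SK_MEMORY_PCPU_RESERVE */

static atomic_long memory_allocated;	/* stand-in for tcp_memory_allocated */
static _Thread_local int fw_alloc;	/* stand-in for the per-cpu cache */

static void mem_add(int amt)
{
	/* Accumulate locally; fold into the shared atomic only past +/-1 MB,
	 * like sk_memory_allocated_add()/_sub() flushing local_reserve.
	 */
	fw_alloc += amt;
	if (fw_alloc >= RESERVE || fw_alloc <= -RESERVE) {
		atomic_fetch_add(&memory_allocated, fw_alloc);
		fw_alloc = 0;
	}
}

int main(void)
{
	for (int i = 0; i < 5000; i++)
		mem_add(4096);	/* charge 5000 pages, one page at a time */
	printf("shared counter: %ld, local reserve: %d\n",
	       (long)atomic_load(&memory_allocated), fw_alloc);
	/* ~20 MB charged, but the shared atomic was written only ~19 times */
	return 0;
}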
include/net/sock.h  +44 −56
@@ -1254,6 +1254,7 @@ struct proto {
 	void			(*enter_memory_pressure)(struct sock *sk);
 	void			(*leave_memory_pressure)(struct sock *sk);
 	atomic_long_t		*memory_allocated;	/* Current allocated memory. */
+	int  __percpu		*per_cpu_fw_alloc;
 	struct percpu_counter	*sockets_allocated;	/* Current number of sockets. */
 
	/*
@@ -1396,22 +1397,48 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
 	return !!*sk->sk_prot->memory_pressure;
 }
 
+static inline long
+proto_memory_allocated(const struct proto *prot)
+{
+	return max(0L, atomic_long_read(prot->memory_allocated));
+}
+
 static inline long
 sk_memory_allocated(const struct sock *sk)
 {
-	return atomic_long_read(sk->sk_prot->memory_allocated);
+	return proto_memory_allocated(sk->sk_prot);
 }
 
+/* 1 MB per cpu, in page units */
+#define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT))
+
 static inline long
 sk_memory_allocated_add(struct sock *sk, int amt)
 {
-	return atomic_long_add_return(amt, sk->sk_prot->memory_allocated);
+	int local_reserve;
+
+	preempt_disable();
+	local_reserve = __this_cpu_add_return(*sk->sk_prot->per_cpu_fw_alloc, amt);
+	if (local_reserve >= SK_MEMORY_PCPU_RESERVE) {
+		__this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve);
+		atomic_long_add(local_reserve, sk->sk_prot->memory_allocated);
+	}
+	preempt_enable();
+	return sk_memory_allocated(sk);
 }
 
 static inline void
 sk_memory_allocated_sub(struct sock *sk, int amt)
 {
-	atomic_long_sub(amt, sk->sk_prot->memory_allocated);
+	int local_reserve;
+
+	preempt_disable();
+	local_reserve = __this_cpu_sub_return(*sk->sk_prot->per_cpu_fw_alloc, amt);
+	if (local_reserve <= -SK_MEMORY_PCPU_RESERVE) {
+		__this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve);
+		atomic_long_add(local_reserve, sk->sk_prot->memory_allocated);
+	}
+	preempt_enable();
 }
 
 #define SK_ALLOC_PERCPU_COUNTER_BATCH 16
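In the two helpers above, the per-cpu reserve may drift anywhere strictly between -SK_MEMORY_PCPU_RESERVE and +SK_MEMORY_PCPU_RESERVE without the shared atomic being touched; preempt_disable() guarantees that the __this_cpu_*() read-modify-write and the conditional flush all hit the same CPU's counter, so no deltas are lost to migration between CPUs.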
@@ -1440,12 +1467,6 @@ proto_sockets_allocated_sum_positive(struct proto *prot)
 	return percpu_counter_sum_positive(prot->sockets_allocated);
 }
 
-static inline long
-proto_memory_allocated(struct proto *prot)
-{
-	return atomic_long_read(prot->memory_allocated);
-}
-
 static inline bool
 proto_memory_pressure(struct proto *prot)
 {
@@ -1532,30 +1553,18 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind);
 void __sk_mem_reduce_allocated(struct sock *sk, int amount);
 void __sk_mem_reclaim(struct sock *sk, int amount);
 
-/* We used to have PAGE_SIZE here, but systems with 64KB pages
- * do not necessarily have 16x time more memory than 4KB ones.
- */
-#define SK_MEM_QUANTUM 4096
-#define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM)
 #define SK_MEM_SEND	0
 #define SK_MEM_RECV	1
 
-/* sysctl_mem values are in pages, we convert them in SK_MEM_QUANTUM units */
+/* sysctl_mem values are in pages */
 static inline long sk_prot_mem_limits(const struct sock *sk, int index)
 {
-	long val = sk->sk_prot->sysctl_mem[index];
-
-#if PAGE_SIZE > SK_MEM_QUANTUM
-	val <<= PAGE_SHIFT - SK_MEM_QUANTUM_SHIFT;
-#elif PAGE_SIZE < SK_MEM_QUANTUM
-	val >>= SK_MEM_QUANTUM_SHIFT - PAGE_SHIFT;
-#endif
-	return val;
+	return sk->sk_prot->sysctl_mem[index];
 }
 
 static inline int sk_mem_pages(int amt)
 {
-	return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT;
+	return (amt + PAGE_SIZE - 1) >> PAGE_SHIFT;
 }
 
 static inline bool sk_has_account(struct sock *sk)
@@ -1566,19 +1575,23 @@ static inline bool sk_has_account(struct sock *sk)

 static inline bool sk_wmem_schedule(struct sock *sk, int size)
 {
+	int delta;
+
 	if (!sk_has_account(sk))
 		return true;
-	return size <= sk->sk_forward_alloc ||
-		__sk_mem_schedule(sk, size, SK_MEM_SEND);
+	delta = size - sk->sk_forward_alloc;
+	return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_SEND);
 }
 
 static inline bool
 sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
 {
+	int delta;
+
 	if (!sk_has_account(sk))
 		return true;
-	return size <= sk->sk_forward_alloc ||
-		__sk_mem_schedule(sk, size, SK_MEM_RECV) ||
+	delta = size - sk->sk_forward_alloc;
+	return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) ||
 		skb_pfmemalloc(skb);
 }
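The delta-based rewrite above charges only the shortfall instead of the full request, which helps keep sk->sk_forward_alloc small. A hedged user-space sketch of the arithmetic (PAGE_SZ, old_charge() and new_charge() are invented names for this example; 4 KB pages assumed):

#include <stdio.h>

#define PAGE_SZ 4096

/* Pages newly charged when scheduling `size` bytes while `fwd` bytes
 * of forward allocation are already available.
 */
static int pages(int bytes)
{
	return (bytes + PAGE_SZ - 1) / PAGE_SZ;
}

static int old_charge(int fwd, int size)
{
	return size <= fwd ? 0 : pages(size);	/* old: charge the full size */
}

static int new_charge(int fwd, int size)
{
	int delta = size - fwd;
	return delta <= 0 ? 0 : pages(delta);	/* new: charge the shortfall */
}

int main(void)
{
	/* 10000 bytes to schedule, 4000 already in sk_forward_alloc */
	printf("old: %d pages, new: %d pages\n",
	       old_charge(4000, 10000), new_charge(4000, 10000));
	/* prints "old: 3 pages, new: 2 pages" */
	return 0;
}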

@@ -1604,7 +1617,7 @@ static inline void sk_mem_reclaim(struct sock *sk)

 	reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);
 
-	if (reclaimable >= SK_MEM_QUANTUM)
+	if (reclaimable >= (int)PAGE_SIZE)
 		__sk_mem_reclaim(sk, reclaimable);
 }

@@ -1614,19 +1627,6 @@ static inline void sk_mem_reclaim_final(struct sock *sk)
 	sk_mem_reclaim(sk);
 }
 
-static inline void sk_mem_reclaim_partial(struct sock *sk)
-{
-	int reclaimable;
-
-	if (!sk_has_account(sk))
-		return;
-
-	reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);
-
-	if (reclaimable > SK_MEM_QUANTUM)
-		__sk_mem_reclaim(sk, reclaimable - 1);
-}
-
 static inline void sk_mem_charge(struct sock *sk, int size)
 {
 	if (!sk_has_account(sk))
@@ -1634,29 +1634,17 @@ static inline void sk_mem_charge(struct sock *sk, int size)
 	sk->sk_forward_alloc -= size;
 }
 
-/* the following macros control memory reclaiming in sk_mem_uncharge()
+/* the following macros control memory reclaiming in mptcp_rmem_uncharge()
  */
 #define SK_RECLAIM_THRESHOLD	(1 << 21)
 #define SK_RECLAIM_CHUNK	(1 << 20)
 
 static inline void sk_mem_uncharge(struct sock *sk, int size)
 {
-	int reclaimable;
-
 	if (!sk_has_account(sk))
 		return;
 	sk->sk_forward_alloc += size;
-	reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);
-
-	/* Avoid a possible overflow.
-	 * TCP send queues can make this happen, if sk_mem_reclaim()
-	 * is not called and more than 2 GBytes are released at once.
-	 *
-	 * If we reach 2 MBytes, reclaim 1 MBytes right now, there is
-	 * no need to hold that much forward allocation anyway.
-	 */
-	if (unlikely(reclaimable >= SK_RECLAIM_THRESHOLD))
-		__sk_mem_reclaim(sk, SK_RECLAIM_CHUNK);
+	sk_mem_reclaim(sk);
 }
 
 /*
include/net/tcp.h  +2 −0
@@ -253,6 +253,8 @@ extern long sysctl_tcp_mem[3];
 #define TCP_RACK_NO_DUPTHRESH    0x4 /* Do not use DUPACK threshold in RACK */
 
 extern atomic_long_t tcp_memory_allocated;
+DECLARE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
+
 extern struct percpu_counter tcp_sockets_allocated;
 extern unsigned long tcp_memory_pressure;
 
include/net/udp.h  +1 −0
@@ -95,6 +95,7 @@ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
 extern struct proto udp_prot;
 
 extern atomic_long_t udp_memory_allocated;
+DECLARE_PER_CPU(int, udp_memory_per_cpu_fw_alloc);
 
 /* sysctl variables for udp */
 extern long sysctl_udp_mem[3];
net/core/datagram.c  +0 −3
@@ -320,7 +320,6 @@ EXPORT_SYMBOL(skb_recv_datagram);
 void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
 {
 	consume_skb(skb);
-	sk_mem_reclaim_partial(sk);
 }
 EXPORT_SYMBOL(skb_free_datagram);

@@ -336,7 +335,6 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
 	slow = lock_sock_fast(sk);
 	sk_peek_offset_bwd(sk, len);
 	skb_orphan(skb);
-	sk_mem_reclaim_partial(sk);
 	unlock_sock_fast(sk, slow);

	/* skb is now orphaned, can be freed outside of locked section */
@@ -396,7 +394,6 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
				      NULL);

 	kfree_skb(skb);
-	sk_mem_reclaim_partial(sk);
 	return err;
}
EXPORT_SYMBOL(skb_kill_datagram);
net/core/sock.c  +12 −10
@@ -991,7 +991,7 @@ EXPORT_SYMBOL(sock_set_mark);
 static void sock_release_reserved_memory(struct sock *sk, int bytes)
 {
 	/* Round down bytes to multiple of pages */
-	bytes &= ~(SK_MEM_QUANTUM - 1);
+	bytes = round_down(bytes, PAGE_SIZE);
 
 	WARN_ON(bytes > sk->sk_reserved_mem);
 	sk->sk_reserved_mem -= bytes;
@@ -1028,9 +1028,9 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
 		return -ENOMEM;
 	}
-	sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT;
+	sk->sk_forward_alloc += pages << PAGE_SHIFT;
 
-	sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT;
+	sk->sk_reserved_mem += pages << PAGE_SHIFT;
 
 	return 0;
 }
@@ -2987,7 +2987,6 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)

 	return 0;
 }
-EXPORT_SYMBOL(__sk_mem_raise_allocated);

/**
 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
@@ -3003,10 +3002,10 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
 {
 	int ret, amt = sk_mem_pages(size);
 
-	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
+	sk->sk_forward_alloc += amt << PAGE_SHIFT;
 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
 	if (!ret)
-		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
+		sk->sk_forward_alloc -= amt << PAGE_SHIFT;
 	return ret;
 }
 EXPORT_SYMBOL(__sk_mem_schedule);
@@ -3029,17 +3028,16 @@ void __sk_mem_reduce_allocated(struct sock *sk, int amount)
 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
 		sk_leave_memory_pressure(sk);
 }
-EXPORT_SYMBOL(__sk_mem_reduce_allocated);
 
 /**
  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
  *	@sk: socket
- *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
+ *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
  */
 void __sk_mem_reclaim(struct sock *sk, int amount)
 {
-	amount >>= SK_MEM_QUANTUM_SHIFT;
-	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
+	amount >>= PAGE_SHIFT;
+	sk->sk_forward_alloc -= amount << PAGE_SHIFT;
 	__sk_mem_reduce_allocated(sk, amount);
 }
 EXPORT_SYMBOL(__sk_mem_reclaim);
@@ -3798,6 +3796,10 @@ int proto_register(struct proto *prot, int alloc_slab)
 		pr_err("%s: missing sysctl_mem\n", prot->name);
 		return -EINVAL;
 	}
+	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
+		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
+		return -EINVAL;
+	}
 	if (alloc_slab) {
 		prot->slab = kmem_cache_create_usercopy(prot->name,
 					prot->obj_size, 0,
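The new proto_register() check above enforces a contract: any protocol that supplies a memory_allocated counter must also supply the per-cpu cache. As an abridged, non-authoritative sketch of how the TCP side of this series satisfies it (the real tcp_prot initializer has many more fields):

/* Abridged sketch; not the complete definitions from the series. */
atomic_long_t tcp_memory_allocated;	/* shared counter, now written rarely */
DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);	/* hot per-cpu cache */

struct proto tcp_prot = {
	/* ... many fields elided ... */
	.memory_allocated	= &tcp_memory_allocated,
	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
	/* ... */
};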