Commit 19757ceb authored by Eric Dumazet, committed by David S. Miller

tcp: switch orphan_count to bare per-cpu counters



Use of a percpu_counter structure to track the count of orphaned
sockets is causing problems on modern hosts with 256 cpus
or more.

Stefan Bach reported serious spinlock contention in real workloads,
which I was able to reproduce with a netfilter rule dropping
incoming FIN packets.

    53.56%  server  [kernel.kallsyms]      [k] queued_spin_lock_slowpath
            |
            ---queued_spin_lock_slowpath
               |
                --53.51%--_raw_spin_lock_irqsave
                          |
                           --53.51%--__percpu_counter_sum
                                     tcp_check_oom
                                     |
                                     |--39.03%--__tcp_close
                                     |          tcp_close
                                     |          inet_release
                                     |          inet6_release
                                     |          sock_close
                                     |          __fput
                                     |          ____fput
                                     |          task_work_run
                                     |          exit_to_usermode_loop
                                     |          do_syscall_64
                                     |          entry_SYSCALL_64_after_hwframe
                                     |          __GI___libc_close
                                     |
                                      --14.48%--tcp_out_of_resources
                                                tcp_write_timeout
                                                tcp_retransmit_timer
                                                tcp_write_timer_handler
                                                tcp_write_timer
                                                call_timer_fn
                                                expire_timers
                                                __run_timers
                                                run_timer_softirq
                                                __softirqentry_text_start

As explained in commit cf86a086 ("net/dst: use a smaller percpu_counter
batch for dst entries accounting"), the default batch size is too big
for the default value of tcp_max_orphans (262144): with 256 cpus the
computed batch is 2 * 256 = 512, so the fast estimate can drift from
the true count by up to roughly 256 * 512 = 131072, half of the limit.

But even if we reduced batch sizes, there would still be cases
where the estimated count of orphans is beyond the limit,
and where tcp_too_many_orphans() has to fall back to the expensive
percpu_counter_sum_positive(), which walks every cpu's counter while
holding the counter's spinlock; that is exactly the contention shown
in the profile above.

One solution is to use plain per-cpu counters, and have
a timer periodically refresh a cached sum of them.

Updating this cache every 100ms seems about right; tcp pressure
state is not radically changing over shorter periods.
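
In sketch form, the new scheme looks like this (tcp_orphan_cache,
tcp_orphan_update and TCP_ORPHAN_TIMER_PERIOD are illustrative names
used here for exposition; only tcp_orphan_count and
tcp_orphan_count_sum() appear in the hunks below):

    /* Bare per-cpu counter: increments stay cpu-local and lock-free. */
    DEFINE_PER_CPU(unsigned int, tcp_orphan_count);

    /* Cached global estimate, refreshed by a timer rather than
     * recomputed on every limit check.
     */
    static int tcp_orphan_cache;
    static struct timer_list tcp_orphan_timer;

    #define TCP_ORPHAN_TIMER_PERIOD	msecs_to_jiffies(100)

    static void tcp_orphan_update(struct timer_list *unused)
    {
    	WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum());
    	mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
    }

    /* The limit check becomes a single READ_ONCE(); the expensive
     * cross-cpu sum never runs from the socket close or timer paths.
     */
    static bool tcp_too_many_orphans(int shift)
    {
    	return READ_ONCE(tcp_orphan_cache) << shift > sysctl_tcp_max_orphans;
    }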

percpu_counter was a good fit 15 years ago, when hosts had fewer
than 16 cpus; it no longer scales by current standards.

v2: Fix the build issue for CONFIG_CRYPTO_DEV_CHELSIO_TLS=m,
    reported by kernel test robot <lkp@intel.com>
    Remove unused socket argument from tcp_too_many_orphans()

Fixes: dd24c001 ("net: Use a percpu_counter for orphan_count")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Stefan Bach <sfb@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 0b93aed2

@@ -870,7 +870,7 @@ static void do_abort_syn_rcv(struct sock *child, struct sock *parent)
 		 * created only after 3 way handshake is done.
 		 */
 		sock_orphan(child);
-		percpu_counter_inc((child)->sk_prot->orphan_count);
+		INC_ORPHAN_COUNT(child);
 		chtls_release_resources(child);
 		chtls_conn_done(child);
 	} else {

@@ -95,7 +95,7 @@ struct deferred_skb_cb {
 #define WSCALE_OK(tp) ((tp)->rx_opt.wscale_ok)
 #define TSTAMP_OK(tp) ((tp)->rx_opt.tstamp_ok)
 #define SACK_OK(tp) ((tp)->rx_opt.sack_ok)
-#define INC_ORPHAN_COUNT(sk) percpu_counter_inc((sk)->sk_prot->orphan_count)
+#define INC_ORPHAN_COUNT(sk) this_cpu_inc(*(sk)->sk_prot->orphan_count)
 
 /* TLS SKB */
 #define skb_ulp_tls_inline(skb)      (ULP_SKB_CB(skb)->ulp.tls.ofld)

@@ -289,7 +289,7 @@ static inline void inet_csk_prepare_for_destroy_sock(struct sock *sk)
 {
 	/* The below has to be done to allow calling inet_csk_destroy_sock */
 	sock_set_flag(sk, SOCK_DEAD);
-	percpu_counter_inc(sk->sk_prot->orphan_count);
+	this_cpu_inc(*sk->sk_prot->orphan_count);
 }
 
 void inet_csk_destroy_sock(struct sock *sk);

@@ -1235,7 +1235,7 @@ struct proto {
 	unsigned int		useroffset;	/* Usercopy region offset */
 	unsigned int		usersize;	/* Usercopy region size */
 
-	struct percpu_counter	*orphan_count;
+	unsigned int __percpu	*orphan_count;
 
 	struct request_sock_ops	*rsk_prot;
 	struct timewait_sock_ops *twsk_prot;
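
The struct proto change above is the core of the switch: instead of a
shared percpu_counter, whose per-cpu deltas must periodically be folded
into one spinlock-protected count, each protocol now points orphan_count
at a bare per-cpu variable, and an increment is a single cpu-local add.
A sketch of the wiring, using tcp_prot as the example user (this
initializer is not part of the hunks shown here):

    DEFINE_PER_CPU(unsigned int, tcp_orphan_count);

    struct proto tcp_prot = {
    	.name		= "TCP",
    	.orphan_count	= &tcp_orphan_count,
    	/* ... remaining members elided ... */
    };

    /* Writers then increment lock-free, as in the hunks above:
     *
     *	this_cpu_inc(*sk->sk_prot->orphan_count);
     */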

@@ -48,7 +48,9 @@
 
 extern struct inet_hashinfo tcp_hashinfo;
 
-extern struct percpu_counter tcp_orphan_count;
+DECLARE_PER_CPU(unsigned int, tcp_orphan_count);
+int tcp_orphan_count_sum(void);
+
 void tcp_time_wait(struct sock *sk, int state, int timeo);
 
 #define MAX_TCP_HEADER	L1_CACHE_ALIGN(128 + MAX_HEADER)
@@ -290,19 +292,6 @@ static inline bool tcp_out_of_memory(struct sock *sk)
 
 void sk_forced_mem_schedule(struct sock *sk, int size);
 
-static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
-{
-	struct percpu_counter *ocp = sk->sk_prot->orphan_count;
-	int orphans = percpu_counter_read_positive(ocp);
-
-	if (orphans << shift > sysctl_tcp_max_orphans) {
-		orphans = percpu_counter_sum_positive(ocp);
-		if (orphans << shift > sysctl_tcp_max_orphans)
-			return true;
-	}
-	return false;
-}
-
 bool tcp_check_oom(struct sock *sk, int shift);
 

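tcp_orphan_count_sum(), declared in the hunk above, is the reader side
that replaces percpu_counter_sum_positive(); a plausible implementation
matching that declaration (the function body is not part of these
hunks):

    int tcp_orphan_count_sum(void)
    {
    	int i, total = 0;

    	for_each_possible_cpu(i)
    		total += per_cpu(tcp_orphan_count, i);

    	/* A socket can be orphaned on one cpu and released on another,
    	 * leaving individual slots transiently "negative"; clamp the
    	 * total at zero.
    	 */
    	return max(total, 0);
    }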