Merge branch 'improving-TCP-behavior-on-host-congestion' (12ff91c8) · Commits · 方亚芬 / linux

net/ipv4/tcp_output.c

+19 −28

Original line number	Diff line number	Diff line
		@@ -980,7 +980,6 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
		{
		struct tcp_sock *tp = tcp_sk(sk);

		skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
		if (sk->sk_pacing_status != SK_PACING_NONE) {
		unsigned long rate = sk->sk_pacing_rate;

		@@ -1028,7 +1027,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,

		BUG_ON(!skb || !tcp_skb_pcount(skb));
		tp = tcp_sk(sk);

		prior_wstamp = tp->tcp_wstamp_ns;
		tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
		skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
		if (clone_it) {
		TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
		- tp->snd_una;
		@@ -1045,11 +1046,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
		return -ENOBUFS;
		}

		prior_wstamp = tp->tcp_wstamp_ns;
		tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);

		skb->skb_mstamp_ns = tp->tcp_wstamp_ns;

		inet = inet_sk(sk);
		tcb = TCP_SKB_CB(skb);
		memset(&opts, 0, sizeof(opts));
		@@ -2937,12 +2933,16 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
		}

		/* To avoid taking spuriously low RTT samples based on a timestamp
		* for a transmit that never happened, always mark EVER_RETRANS
		*/
		TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;

		if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
		tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
		TCP_SKB_CB(skb)->seq, segs, err);

		if (likely(!err)) {
		TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
		trace_tcp_retransmit_skb(sk, skb);
		} else if (err != -EBUSY) {
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
		@@ -2963,13 +2963,12 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
		#endif
		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
		tp->retrans_out += tcp_skb_pcount(skb);
		}

		/* Save stamp of the first retransmit. */
		/* Save stamp of the first (attempted) retransmit. */
		if (!tp->retrans_stamp)
		tp->retrans_stamp = tcp_skb_timestamp(skb);

		}

		if (tp->undo_retrans < 0)
		tp->undo_retrans = 0;
		tp->undo_retrans += tcp_skb_pcount(skb);
		@@ -3750,7 +3749,7 @@ void tcp_send_probe0(struct sock *sk)
		struct inet_connection_sock *icsk = inet_csk(sk);
		struct tcp_sock *tp = tcp_sk(sk);
		struct net *net = sock_net(sk);
		unsigned long probe_max;
		unsigned long timeout;
		int err;

		err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
		@@ -3762,26 +3761,18 @@ void tcp_send_probe0(struct sock *sk)
		return;
		}

		icsk->icsk_probes_out++;
		if (err <= 0) {
		if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
		icsk->icsk_backoff++;
		icsk->icsk_probes_out++;
		probe_max = TCP_RTO_MAX;
		timeout = tcp_probe0_when(sk, TCP_RTO_MAX);
		} else {
		/* If packet was not sent due to local congestion,
		* do not backoff and do not remember icsk_probes_out.
		* Let local senders to fight for local resources.
		*
		* Use accumulated backoff yet.
		* Let senders fight for local resources conservatively.
		*/
		if (!icsk->icsk_probes_out)
		icsk->icsk_probes_out = 1;
		probe_max = TCP_RESOURCE_PROBE_INTERVAL;
		timeout = TCP_RESOURCE_PROBE_INTERVAL;
		}
		tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
		tcp_probe0_when(sk, probe_max),
		TCP_RTO_MAX,
		NULL);
		tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX, NULL);
		}

		int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)

net/ipv4/tcp_timer.c

+35 −48

Original line number	Diff line number	Diff line
		@@ -22,28 +22,14 @@
		#include <linux/gfp.h>
		#include <net/tcp.h>

		/* Pick the timestamp used as the start of the retransmission period.
		 *
		 * Returns tp->retrans_stamp when it is non-zero.  Otherwise falls back
		 * to the timestamp of the skb at the head of the retransmit queue, or
		 * 0 when that queue has no head.
		 */
		static u32 tcp_retransmit_stamp(const struct sock *sk)
		{
			const u32 stamp = tcp_sk(sk)->retrans_stamp;
			struct sk_buff *first;

			/* Common case: a retransmit stamp has already been recorded. */
			if (likely(stamp))
				return stamp;

			first = tcp_rtx_queue_head(sk);
			return first ? tcp_skb_timestamp(first) : 0;
		}

		static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
		{
		struct inet_connection_sock *icsk = inet_csk(sk);
		u32 elapsed, start_ts;
		s32 remaining;

		start_ts = tcp_retransmit_stamp(sk);
		if (!icsk->icsk_user_timeout || !start_ts)
		start_ts = tcp_sk(sk)->retrans_stamp;
		if (!icsk->icsk_user_timeout)
		return icsk->icsk_rto;
		elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts;
		remaining = icsk->icsk_user_timeout - elapsed;
		@@ -173,7 +159,20 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
		}

		/* Model the total time spanned by @boundary backed-off timeouts.
		 *
		 * @sk:       socket (not referenced by the computation below)
		 * @boundary: number of backoff steps to account for
		 * @rto_base: base timeout; used as a divisor, so it must be non-zero
		 *
		 * While @boundary does not exceed ilog2(TCP_RTO_MAX / rto_base) the
		 * total is ((2 << boundary) - 1) * rto_base.  Past that threshold every
		 * additional step contributes a flat TCP_RTO_MAX on top of the sum
		 * reached at the threshold.
		 *
		 * Returns the accumulated value passed through jiffies_to_msecs().
		 */
		static unsigned int tcp_model_timeout(struct sock *sk,
						      unsigned int boundary,
						      unsigned int rto_base)
		{
			const unsigned int doubling_limit = ilog2(TCP_RTO_MAX / rto_base);
			unsigned int total;

			if (boundary > doubling_limit) {
				/* Doubling phase up to the limit, then flat TCP_RTO_MAX steps. */
				total = ((2 << doubling_limit) - 1) * rto_base +
					(boundary - doubling_limit) * TCP_RTO_MAX;
			} else {
				total = ((2 << boundary) - 1) * rto_base;
			}

			return jiffies_to_msecs(total);
		}
		/**
		* retransmits_timed_out() - returns true if this connection has timed out
		* @sk: The current socket
		@@ -191,26 +190,15 @@ static bool retransmits_timed_out(struct sock *sk,
		unsigned int boundary,
		unsigned int timeout)
		{
		const unsigned int rto_base = TCP_RTO_MIN;
		unsigned int linear_backoff_thresh, start_ts;
		unsigned int start_ts;

		if (!inet_csk(sk)->icsk_retransmits)
		return false;

		start_ts = tcp_retransmit_stamp(sk);
		if (!start_ts)
		return false;
		start_ts = tcp_sk(sk)->retrans_stamp;
		if (likely(timeout == 0))
		timeout = tcp_model_timeout(sk, boundary, TCP_RTO_MIN);

		if (likely(timeout == 0)) {
		linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);

		if (boundary <= linear_backoff_thresh)
		timeout = ((2 << boundary) - 1) * rto_base;
		else
		timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
		(boundary - linear_backoff_thresh) * TCP_RTO_MAX;
		timeout = jiffies_to_msecs(timeout);
		}
		return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0;
		}

		@@ -345,7 +333,6 @@ static void tcp_probe_timer(struct sock *sk)
		struct sk_buff *skb = tcp_send_head(sk);
		struct tcp_sock *tp = tcp_sk(sk);
		int max_probes;
		u32 start_ts;

		if (tp->packets_out || !skb) {
		icsk->icsk_probes_out = 0;
		@@ -360,12 +347,13 @@ static void tcp_probe_timer(struct sock *sk)
		* corresponding system limit. We also implement similar policy when
		* we use RTO to probe window in tcp_retransmit_timer().
		*/
		start_ts = tcp_skb_timestamp(skb);
		if (!start_ts)
		skb->skb_mstamp_ns = tp->tcp_clock_cache;
		else if (icsk->icsk_user_timeout &&
		(s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout)
		if (icsk->icsk_user_timeout) {
		u32 elapsed = tcp_model_timeout(sk, icsk->icsk_probes_out,
		tcp_probe0_base(sk));

		if (elapsed >= icsk->icsk_user_timeout)
		goto abort;
		}

		max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2;
		if (sock_flag(sk, SOCK_DEAD)) {
		@@ -395,6 +383,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
		struct inet_connection_sock *icsk = inet_csk(sk);
		int max_retries = icsk->icsk_syn_retries ? :
		sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
		struct tcp_sock *tp = tcp_sk(sk);
		struct request_sock *req;

		req = tcp_sk(sk)->fastopen_rsk;
		@@ -412,6 +401,8 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
		inet_rtx_syn_ack(sk, req);
		req->num_timeout++;
		icsk->icsk_retransmits++;
		if (!tp->retrans_stamp)
		tp->retrans_stamp = tcp_time_stamp(tp);
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
		TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
		}
		@@ -443,10 +434,8 @@ void tcp_retransmit_timer(struct sock *sk)
		*/
		return;
		}
		if (!tp->packets_out)
		goto out;

		WARN_ON(tcp_rtx_queue_empty(sk));
		if (!tp->packets_out || WARN_ON_ONCE(tcp_rtx_queue_empty(sk)))
		return;

		tp->tlp_high_seq = 0;

		@@ -511,14 +500,13 @@ void tcp_retransmit_timer(struct sock *sk)

		tcp_enter_loss(sk);

		icsk->icsk_retransmits++;
		if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
		/* Retransmission failed because of local congestion,
		* do not backoff.
		* Let senders fight for local resources conservatively.
		*/
		if (!icsk->icsk_retransmits)
		icsk->icsk_retransmits = 1;
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
		min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
		TCP_RESOURCE_PROBE_INTERVAL,
		TCP_RTO_MAX);
		goto out;
		}
		@@ -539,7 +527,6 @@ void tcp_retransmit_timer(struct sock *sk)
		* the 120 second clamps though!
		*/
		icsk->icsk_backoff++;
		icsk->icsk_retransmits++;

		out_reset_timer:
		/* If stream is thin, use linear timeouts. Since 'icsk_backoff' is