Commit ee3398c7 authored by David S. Miller

Merge branch 'net-skb-defer-freeing-polish'

Eric Dumazet says:

====================
net: polish skb defer freeing

While testing this recently added feature on a variety
of platforms/configurations, I found the following issues:

1) A race leading to concurrent calls to smp_call_function_single_async()

2) Missed opportunity to use napi_consume_skb()

3) Need to limit the max length of the per-cpu lists.

4) Process the per-cpu list more frequently, for the
   (unusual) case where net_rx_action() has multiple
   napi_poll() to process per round.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents 3daebfbe 90987650
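
For context on item (1): the fix guards the IPI with a new per-cpu flag,
defer_ipi_scheduled, so that only the caller that flips it 0 -> 1 may call
smp_call_function_single_async(), and the IPI handler re-arms it with a
release store. A minimal userspace sketch of this schedule-once pattern,
using C11 atomics in place of the kernel's cmpxchg()/smp_store_release()
(names and output are illustrative, not kernel API):

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_int defer_ipi_scheduled;	/* models sd->defer_ipi_scheduled */

	/* Producer side: only the caller that flips 0 -> 1 may send the IPI. */
	static void maybe_kick(void)
	{
		int zero = 0;

		if (atomic_compare_exchange_strong(&defer_ipi_scheduled, &zero, 1))
			printf("IPI sent (exactly one concurrent caller gets here)\n");
		else
			printf("IPI already in flight, skip\n");
	}

	/* IPI-handler side: do the work, then re-arm with release semantics. */
	static void handler(void)
	{
		printf("raise NET_RX_SOFTIRQ\n");
		atomic_store_explicit(&defer_ipi_scheduled, 0, memory_order_release);
	}

	int main(void)
	{
		maybe_kick();	/* sends */
		maybe_kick();	/* suppressed while still armed */
		handler();	/* re-arms */
		maybe_kick();	/* sends again */
		return 0;
	}

The release store orders the softirq raise before the flag clear becomes
visible, so a producer cannot win the compare-and-swap again until the
previous IPI has done its work.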
Documentation/admin-guide/sysctl/net.rst  +8 −0
@@ -322,6 +322,14 @@ a leaked reference faster. A larger value may be useful to prevent false
 warnings on slow/loaded systems.
 Default value is 10, minimum 1, maximum 3600.
 
+skb_defer_max
+-------------
+
+Max size (in skbs) of the per-cpu list of skbs being freed
+by the cpu which allocated them. Used by TCP stack so far.
+
+Default: 64
+
 optmem_max
 ----------
 
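A quick way to inspect the knob from userspace; this sketch is not part of
the patch, and it assumes the sysctl is exposed as net.core.skb_defer_max,
i.e. /proc/sys/net/core/skb_defer_max, as its placement in this file
suggests:

	#include <stdio.h>

	int main(void)
	{
		/* Assumed path; the knob should land under net.core */
		FILE *f = fopen("/proc/sys/net/core/skb_defer_max", "r");
		unsigned int defer_max;

		if (!f || fscanf(f, "%u", &defer_max) != 1) {
			perror("skb_defer_max");
			return 1;
		}
		printf("skb_defer_max = %u\n", defer_max);
		fclose(f);
		return 0;
	}
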
include/linux/netdevice.h  +1 −0
@@ -3136,6 +3136,7 @@ struct softnet_data {
 	/* Another possibly contended cache line */
 	spinlock_t		defer_lock ____cacheline_aligned_in_smp;
 	int			defer_count;
+	int			defer_ipi_scheduled;
 	struct sk_buff		*defer_list;
 	call_single_data_t	defer_csd;
 };
net/core/dev.c  +10 −5
@@ -4330,6 +4330,7 @@ int netdev_max_backlog __read_mostly = 1000;
 EXPORT_SYMBOL(netdev_max_backlog);
 
 int netdev_tstamp_prequeue __read_mostly = 1;
+unsigned int sysctl_skb_defer_max __read_mostly = 64;
 int netdev_budget __read_mostly = 300;
 /* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
 unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
@@ -4582,9 +4583,12 @@ static void rps_trigger_softirq(void *data)
 #endif /* CONFIG_RPS */
 
 /* Called from hardirq (IPI) context */
-static void trigger_rx_softirq(void *data __always_unused)
+static void trigger_rx_softirq(void *data)
 {
+	struct softnet_data *sd = data;
+
 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+	smp_store_release(&sd->defer_ipi_scheduled, 0);
 }
 
 /*
@@ -6630,7 +6634,7 @@ static void skb_defer_free_flush(struct softnet_data *sd)

 	while (skb != NULL) {
 		next = skb->next;
-		__kfree_skb(skb);
+		napi_consume_skb(skb, 1);
 		skb = next;
 	}
 }
@@ -6651,6 +6655,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
 	for (;;) {
 		struct napi_struct *n;
 
+		skb_defer_free_flush(sd);
+
 		if (list_empty(&list)) {
 			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
 				goto end;
@@ -6680,8 +6686,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 
 	net_rps_action_and_irq_enable(sd);
-end:
-	skb_defer_free_flush(sd);
+end:;
 }
 
 struct netdev_adjacent {
@@ -11382,7 +11387,7 @@ static int __init net_dev_init(void)
 		INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
 		sd->cpu = i;
 #endif
-		INIT_CSD(&sd->defer_csd, trigger_rx_softirq, NULL);
+		INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
 		spin_lock_init(&sd->defer_lock);
 
 		init_gro_hash(&sd->backlog);
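
Only the tail of skb_defer_free_flush() appears in the hunk above: the
function first detaches the whole per-cpu list and resets defer_count under
defer_lock, then walks the detached list outside the lock. A userspace model
of that detach-then-free shape, with illustrative names and a pthread mutex
standing in for the spinlock:

	#include <stdio.h>
	#include <stdlib.h>
	#include <pthread.h>

	struct buf {
		struct buf *next;
		int id;
	};

	static struct buf *defer_list;
	static int defer_count;
	static pthread_mutex_t defer_lock = PTHREAD_MUTEX_INITIALIZER;

	static void defer_free_flush(void)
	{
		struct buf *b, *next;

		/* Grab the whole list and reset the count under the lock. */
		pthread_mutex_lock(&defer_lock);
		b = defer_list;
		defer_list = NULL;
		defer_count = 0;
		pthread_mutex_unlock(&defer_lock);

		/* Free outside the lock; matches the while loop in the hunk. */
		while (b) {
			next = b->next;
			printf("freeing buf %d\n", b->id);
			free(b);	/* kernel uses napi_consume_skb(skb, 1) */
			b = next;
		}
	}

	int main(void)
	{
		for (int i = 0; i < 3; i++) {
			struct buf *b = malloc(sizeof(*b));

			b->id = i;
			pthread_mutex_lock(&defer_lock);
			b->next = defer_list;
			defer_list = b;
			defer_count++;
			pthread_mutex_unlock(&defer_lock);
		}
		defer_free_flush();
		return 0;
	}

The switch from __kfree_skb() to napi_consume_skb(skb, 1) is item (2): a
non-zero budget tells napi_consume_skb() it runs in a context where the skb
may be recycled through the per-cpu NAPI cache rather than taking the full
free path.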
net/core/dev.h  +1 −1
@@ -39,7 +39,7 @@ void dev_addr_check(struct net_device *dev);
 /* sysctls not referred to from outside net/core/ */
 extern int		netdev_budget;
 extern unsigned int	netdev_budget_usecs;
-
+extern unsigned int	sysctl_skb_defer_max;
 extern int		netdev_tstamp_prequeue;
 extern int		netdev_unregister_timeout_secs;
 extern int		weight_p;
net/core/skbuff.c  +10 −8
@@ -80,6 +80,7 @@
 #include <linux/user_namespace.h>
 #include <linux/indirect_call_wrapper.h>
 
+#include "dev.h"
 #include "sock_destructor.h"
 
 struct kmem_cache *skbuff_head_cache __ro_after_init;
@@ -6496,16 +6497,21 @@ void skb_attempt_defer_free(struct sk_buff *skb)
 	int cpu = skb->alloc_cpu;
 	struct softnet_data *sd;
 	unsigned long flags;
+	unsigned int defer_max;
 	bool kick;
 
 	if (WARN_ON_ONCE(cpu >= nr_cpu_ids) ||
 	    !cpu_online(cpu) ||
 	    cpu == raw_smp_processor_id()) {
-		__kfree_skb(skb);
+nodefer:	__kfree_skb(skb);
 		return;
 	}
 
 	sd = &per_cpu(softnet_data, cpu);
+	defer_max = READ_ONCE(sysctl_skb_defer_max);
+	if (READ_ONCE(sd->defer_count) >= defer_max)
+		goto nodefer;
+
 	/* We do not send an IPI or any signal.
 	 * Remote cpu will eventually call skb_defer_free_flush()
 	 */
@@ -6515,18 +6521,14 @@ void skb_attempt_defer_free(struct sk_buff *skb)
 	WRITE_ONCE(sd->defer_list, skb);
 	sd->defer_count++;
 
-	/* kick every time queue length reaches 128.
-	 * This should avoid blocking in smp_call_function_single_async().
-	 * This condition should hardly be bit under normal conditions,
-	 * unless cpu suddenly stopped to receive NIC interrupts.
-	 */
-	kick = sd->defer_count == 128;
+	/* Send an IPI every time queue reaches half capacity. */
+	kick = sd->defer_count == (defer_max >> 1);
 
 	spin_unlock_irqrestore(&sd->defer_lock, flags);
 
 	/* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
 	 * if we are unlucky enough (this seems very unlikely).
 	 */
-	if (unlikely(kick))
+	if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
 		smp_call_function_single_async(cpu, &sd->defer_csd);
 }
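
Two details worth spelling out from this last hunk: with the default
skb_defer_max of 64, kick fires when defer_count reaches 64 >> 1 = 32, and
once defer_count is at or above defer_max new skbs branch to the nodefer
label and are freed immediately on the calling cpu. The
cmpxchg(&sd->defer_ipi_scheduled, 0, 1) here pairs with the
smp_store_release(&sd->defer_ipi_scheduled, 0) in trigger_rx_softirq()
above, so at most one defer_csd IPI can be outstanding per target cpu,
which is what closes race (1).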