Commit 35dfb013 authored by Andrew Sy Kim's avatar Andrew Sy Kim Committed by Pablo Neira Ayuso
Browse files

ipvs: queue delayed work to expire no destination connections if expire_nodest_conn=1



When expire_nodest_conn=1 and a destination is deleted, IPVS does not
expire the existing connections until the next matching incoming packet.
If there are many connection entries from a single client to a single
destination, many packets may get dropped before all the connections are
expired (more likely with lots of UDP traffic). An optimization can be
made where upon deletion of a destination, IPVS queues up delayed work
to immediately expire any connections with a deleted destination. This
ensures any reused source ports from a client (within the IPVS timeouts)
are scheduled to new real servers instead of silently dropped.

Signed-off-by: default avatarAndrew Sy Kim <kim.andrewsy@gmail.com>
Signed-off-by: default avatarJulian Anastasov <ja@ssi.bg>
Signed-off-by: default avatarPablo Neira Ayuso <pablo@netfilter.org>
parent 336f531a
Loading
Loading
Loading
Loading
+29 −0
Original line number Diff line number Diff line
@@ -14,6 +14,7 @@
#include <linux/spinlock.h>             /* for struct rwlock_t */
#include <linux/atomic.h>               /* for struct atomic_t */
#include <linux/refcount.h>             /* for struct refcount_t */
#include <linux/workqueue.h>

#include <linux/compiler.h>
#include <linux/timer.h>
@@ -886,6 +887,8 @@ struct netns_ipvs {
	atomic_t		conn_out_counter;

#ifdef CONFIG_SYSCTL
	/* delayed work for expiring no dest connections */
	struct delayed_work	expire_nodest_conn_work;
	/* 1/rate drop and drop-entry variables */
	struct delayed_work	defense_work;   /* Work handler */
	int			drop_rate;
@@ -1051,6 +1054,11 @@ static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs)
	return ipvs->sysctl_conn_reuse_mode;
}

static inline int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs)
{
	return ipvs->sysctl_expire_nodest_conn;
}

static inline int sysctl_schedule_icmp(struct netns_ipvs *ipvs)
{
	return ipvs->sysctl_schedule_icmp;
@@ -1138,6 +1146,11 @@ static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs)
	return 1;
}

static inline int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs)
{
	return 0;
}

static inline int sysctl_schedule_icmp(struct netns_ipvs *ipvs)
{
	return 0;
@@ -1507,6 +1520,22 @@ static inline int ip_vs_todrop(struct netns_ipvs *ipvs)
static inline int ip_vs_todrop(struct netns_ipvs *ipvs) { return 0; }
#endif

#ifdef CONFIG_SYSCTL
/* Enqueue delayed work for expiring no dest connections
 * Only run when sysctl_expire_nodest=1
 */
static inline void ip_vs_enqueue_expire_nodest_conns(struct netns_ipvs *ipvs)
{
	if (sysctl_expire_nodest_conn(ipvs))
		queue_delayed_work(system_long_wq,
				   &ipvs->expire_nodest_conn_work, 1);
}

void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs);
#else
static inline void ip_vs_enqueue_expire_nodest_conns(struct netns_ipvs *ipvs) {}
#endif

#define IP_VS_DFWD_METHOD(dest) (atomic_read(&(dest)->conn_flags) & \
				 IP_VS_CONN_F_FWD_MASK)

+39 −0
Original line number Diff line number Diff line
@@ -1389,6 +1389,45 @@ static void ip_vs_conn_flush(struct netns_ipvs *ipvs)
		goto flush_again;
	}
}

#ifdef CONFIG_SYSCTL
void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs)
{
	int idx;
	struct ip_vs_conn *cp, *cp_c;
	struct ip_vs_dest *dest;

	rcu_read_lock();
	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
			if (cp->ipvs != ipvs)
				continue;

			dest = cp->dest;
			if (!dest || (dest->flags & IP_VS_DEST_F_AVAILABLE))
				continue;

			if (atomic_read(&cp->n_control))
				continue;

			cp_c = cp->control;
			IP_VS_DBG(4, "del connection\n");
			ip_vs_conn_del(cp);
			if (cp_c && !atomic_read(&cp_c->n_control)) {
				IP_VS_DBG(4, "del controlling connection\n");
				ip_vs_conn_del(cp_c);
			}
		}
		cond_resched_rcu();

		/* netns clean up started, abort delayed work */
		if (!ipvs->enable)
			break;
	}
	rcu_read_unlock();
}
#endif

/*
 * per netns init and exit
 */
+20 −27
Original line number Diff line number Diff line
@@ -694,16 +694,10 @@ static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs)
	return ipvs->sysctl_nat_icmp_send;
}

static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs)
{
	return ipvs->sysctl_expire_nodest_conn;
}

#else

static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; }
static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return 0; }
static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; }

#endif

@@ -2097,36 +2091,35 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
		}
	}

	if (unlikely(!cp)) {
		int v;

		if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph))
			return v;
	}

	IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet");

	/* Check the server status */
	if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
	if (cp && cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
		/* the destination server is not available */
		if (sysctl_expire_nodest_conn(ipvs)) {
			bool old_ct = ip_vs_conn_uses_old_conntrack(cp, skb);

		__u32 flags = cp->flags;

		/* when timer already started, silently drop the packet.*/
		if (timer_pending(&cp->timer))
			__ip_vs_conn_put(cp);
		else
			ip_vs_conn_put(cp);
			if (!old_ct)
				cp->flags &= ~IP_VS_CONN_F_NFCT;

		if (sysctl_expire_nodest_conn(ipvs) &&
		    !(flags & IP_VS_CONN_F_ONE_PACKET)) {
			/* try to expire the connection immediately */
			ip_vs_conn_expire_now(cp);
			__ip_vs_conn_put(cp);
			if (old_ct)
				return NF_DROP;
			cp = NULL;
		} else {
			__ip_vs_conn_put(cp);
			return NF_DROP;
		}
	}

		return NF_DROP;
	if (unlikely(!cp)) {
		int v;

		if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph))
			return v;
	}

	IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet");

	ip_vs_in_stats(cp, skb);
	ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
	if (cp->packet_xmit)
+22 −0
Original line number Diff line number Diff line
@@ -210,6 +210,17 @@ static void update_defense_level(struct netns_ipvs *ipvs)
	local_bh_enable();
}

/* Handler for delayed work for expiring no
 * destination connections
 */
static void expire_nodest_conn_handler(struct work_struct *work)
{
	struct netns_ipvs *ipvs;

	ipvs = container_of(work, struct netns_ipvs,
			    expire_nodest_conn_work.work);
	ip_vs_expire_nodest_conn_flush(ipvs);
}

/*
 *	Timer for checking the defense
@@ -1164,6 +1175,12 @@ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest,
	list_add(&dest->t_list, &ipvs->dest_trash);
	dest->idle_start = 0;
	spin_unlock_bh(&ipvs->dest_trash_lock);

	/* Queue up delayed work to expire all no destination connections.
	 * No-op when CONFIG_SYSCTL is disabled.
	 */
	if (!cleanup)
		ip_vs_enqueue_expire_nodest_conns(ipvs);
}


@@ -4086,6 +4103,10 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
	queue_delayed_work(system_long_wq, &ipvs->defense_work,
			   DEFENSE_TIMER_PERIOD);

	/* Init delayed work for expiring no dest conn */
	INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work,
			  expire_nodest_conn_handler);

	return 0;
}

@@ -4093,6 +4114,7 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
{
	struct net *net = ipvs->net;

	cancel_delayed_work_sync(&ipvs->expire_nodest_conn_work);
	cancel_delayed_work_sync(&ipvs->defense_work);
	cancel_work_sync(&ipvs->defense_work.work);
	unregister_net_sysctl_table(ipvs->sysctl_hdr);