Commit 705dd344 authored by Julian Anastasov, committed by Pablo Neira Ayuso

ipvs: use kthreads for stats estimation

Estimating all entries in a single list from timer context,
on a single CPU, causes large latency when many IPVS rules
are present, as reported in [1], [2], [3].

Spread the estimator structures over multiple chains and
use kthread(s) for the estimation. The chains are processed
across multiple (50) timer ticks, preserving the 2-second
interval between estimations with reasonable accuracy.
Every chain is processed under the RCU read lock.
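
As an illustration, a minimal sketch of one tick of work,
assuming a hypothetical per-chain helper est_chain_sketch()
(the real loop lives in ip_vs_est.c, collapsed below). Every
IPVS_EST_TICK = 2*HZ/50 jiffies (40ms at HZ=1000) one row of
chains is walked under the RCU read lock:

static void est_tick_sketch(struct ip_vs_est_kt_data *kd, int row)
{
	struct ip_vs_est_tick_data *td;
	int cid;

	rcu_read_lock();
	td = rcu_dereference(kd->ticks[row]);
	if (td) {
		for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) {
			/* hypothetical: walk one chain, update rates */
			est_chain_sketch(&td->chains[cid]);
			/* relax between chains to keep latency low */
			cond_resched_rcu();
		}
	}
	rcu_read_unlock();
}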

Every kthread works over its own data structure, and all
such contexts are attached to an array. The contexts can be
preserved while the kthread tasks are stopped or restarted.
When estimators are removed, unused kthread contexts are
released and the corresponding slots in the array are left
empty, ready for reuse.
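
For illustration, a sketch of how a released slot can be
reused when a new context is needed; est_find_free_slot() is
a hypothetical name, the patch does this inside ip_vs_est.c:

static int est_find_free_slot(struct netns_ipvs *ipvs)
{
	int id;

	for (id = 0; id < ipvs->est_kt_count; id++)
		if (!ipvs->est_kt_arr[id])
			return id;	/* reuse a slot left empty */
	return ipvs->est_kt_count;	/* no hole, append new slot */
}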

The first kthread determines the parameters to use, e.g. the
maximum number of estimators to process per kthread, based
on the measured chain length (chain_max). chain_max allows a
sub-100us cond_resched rate, and the estimation is limited
to 1/8 of the CPU capacity, so that no harm is done even if
chain_max is not calculated correctly.
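
The budget behind these numbers, following the
IPVS_EST_CHAIN_FACTOR comment in the header below:

/* 2 sec period = 2 * 1000 ms * 10 = 20000 units of 100us.
 * Per tick: 20000 / IPVS_EST_LOAD_DIVISOR(8) / IPVS_EST_NTICKS(50)
 *         = 50, aligned down to 48 chains of ~100us each.
 * CPU use: 48 * 100us = 4.8ms of each 40ms tick, i.e. 12%,
 * so cond_resched can run about every 100us and a kthread
 * stays near 1/8 of one CPU even if chain_max is off.
 */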

chain_max is calculated taking into account factors such as
CPU speed and memory/cache speed. The cache_factor (4) was
selected from real tests with the current generation of
CPU/NUMA configurations; it corrects for the difference in
CPU usage between the cached state (during the calc phase)
and the non-cached (working) state of the estimated per-CPU
data.
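
A simplified sketch of the idea (assumed helper name and
simplified math; the real calc-phase code in ip_vs_est.c is
more involved): measure the cache-hot cost of one estimation
during the calc phase, scale it by cache_factor to
approximate the cache-cold cost during normal work, and fit
the result into the ~100us chain budget:

static int est_chain_max_sketch(u64 ns_per_est_cached)
{
	u64 ns_cold = ns_per_est_cached * 4;	/* cache_factor */
	u64 n = div64_u64(100 * 1000, ns_cold);	/* ests per ~100us */

	return n ? (int)n : 1;
}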

The first kthread also plays the role of distributor: it
spreads newly added estimators across all kthreads, keeping
the time to add estimators low. The optimization is based on
the fact that a newly added estimator needs its first
estimation only after 2 seconds, so there is enough time to
offload the chain insertion from the controlling process to
kthread 0.
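
A hedged sketch of the two halves of this hand-off (helper
names are illustrative; the patch exports __ip_vs_mutex, as
seen in the header diff below, so ip_vs_est.c can serialize
with the configuration path):

/* Controlling process: cheap add to the temp list */
static void est_add_fast(struct netns_ipvs *ipvs,
			 struct ip_vs_estimator *e)
{
	e->ktid = -1;	/* not yet assigned to a kthread */
	hlist_add_head(&e->list, &ipvs->est_temp_list);
}

/* Kthread 0, later: move temp entries to their final chain */
static void est_drain_temp(struct netns_ipvs *ipvs)
{
	struct hlist_node *n;
	struct ip_vs_estimator *e;

	hlist_for_each_entry_safe(e, n, &ipvs->est_temp_list, list) {
		hlist_del(&e->list);
		est_link_to_kthread(ipvs, e);	/* hypothetical */
	}
}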

A kthread context may grow from 1 to 50 allocated tick
structures, which saves memory for setups with a small
number of estimators.
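
A sketch of the on-demand allocation (error handling
trimmed, helper name assumed): a netns with a handful of
estimators holds one ip_vs_est_tick_data instead of 50:

static struct ip_vs_est_tick_data *
est_tick_get(struct ip_vs_est_kt_data *kd, int row)
{
	struct ip_vs_est_tick_data *td =
		rcu_dereference_protected(kd->ticks[row], 1);

	if (!td) {
		td = kzalloc(sizeof(*td), GFP_KERNEL);
		if (td)
			RCU_INIT_POINTER(kd->ticks[row], td);
	}
	return td;
}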

We also add the delayed work est_reload_work, which makes
sure the kthread tasks are properly started/stopped.

ip_vs_start_estimator() is changed to report errors, which
allows the estimators to be stored safely in the allocated
structures.

Many thanks to Jiri Wiesner for his valuable comments
and for spending a lot of time reviewing and testing
the changes on different platforms with 48-256 CPUs and
1-8 NUMA nodes under different cpufreq governors.

[1] Report from Yunhong Jiang:
https://lore.kernel.org/netdev/D25792C1-1B89-45DE-9F10-EC350DC04ADC@gmail.com/
[2]
https://marc.info/?l=linux-virtual-server&m=159679809118027&w=2
[3] Report from Dust:
https://archive.linuxvirtualserver.org/html/lvs-devel/2020-12/msg00000.html



Signed-off-by: Julian Anastasov <ja@ssi.bg>
Cc: yunhong-cgl jiang <xintian1976@gmail.com>
Cc: "dust.li" <dust.li@linux.alibaba.com>
Reviewed-by: Jiri Wiesner <jwiesner@suse.de>
Tested-by: Jiri Wiesner <jwiesner@suse.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
parent 1dbd8d9a
include/net/ip_vs.h: +83 −5
@@ -42,6 +42,8 @@ static inline struct netns_ipvs *net_ipvs(struct net* net)
/* Connections' size value needed by ip_vs_ctl.c */
extern int ip_vs_conn_tab_size;

extern struct mutex __ip_vs_mutex;

struct ip_vs_iphdr {
	int hdr_flags;	/* ipvs flags */
	__u32 off;	/* Where IP or IPv4 header starts */
@@ -365,7 +367,7 @@ struct ip_vs_cpu_stats {

/* IPVS statistics objects */
struct ip_vs_estimator {
	struct list_head	list;
	struct hlist_node	list;

	u64			last_inbytes;
	u64			last_outbytes;
@@ -378,6 +380,10 @@ struct ip_vs_estimator {
	u64			outpps;
	u64			inbps;
	u64			outbps;

	s32			ktid:16,	/* kthread ID, -1=temp list */
				ktrow:8,	/* row/tick ID for kthread */
				ktcid:8;	/* chain ID for kthread tick */
};

/*
@@ -415,6 +421,66 @@ struct ip_vs_stats *ip_vs_stats_alloc(void);
void ip_vs_stats_release(struct ip_vs_stats *stats);
void ip_vs_stats_free(struct ip_vs_stats *stats);

/* Process estimators in multiple timer ticks (20/50/100, see ktrow) */
#define IPVS_EST_NTICKS		50
/* Estimation uses a 2-second period containing ticks (in jiffies) */
#define IPVS_EST_TICK		((2 * HZ) / IPVS_EST_NTICKS)

/* Limit of CPU load per kthread (8 for 12.5%), ratio of CPU capacity (1/C).
 * Value of 4 and above ensures kthreads will take work without exceeding
 * the CPU capacity under different circumstances.
 */
#define IPVS_EST_LOAD_DIVISOR	8

/* Kthreads should not have work that exceeds the CPU load above 50% */
#define IPVS_EST_CPU_KTHREADS	(IPVS_EST_LOAD_DIVISOR / 2)

/* Desired number of chains per timer tick (chain load factor in 100us units),
 * 48=4.8ms of 40ms tick (12% CPU usage):
 * 2 sec * 1000 ms in sec * 10 (100us in ms) / 8 (12.5%) / 50
 */
#define IPVS_EST_CHAIN_FACTOR	\
	ALIGN_DOWN(2 * 1000 * 10 / IPVS_EST_LOAD_DIVISOR / IPVS_EST_NTICKS, 8)

/* Compiled number of chains per tick
 * The defines should match cond_resched_rcu
 */
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
#define IPVS_EST_TICK_CHAINS	IPVS_EST_CHAIN_FACTOR
#else
#define IPVS_EST_TICK_CHAINS	1
#endif

#if IPVS_EST_NTICKS > 127
#error Too many timer ticks for ktrow
#endif

/* Multiple chains processed in same tick */
struct ip_vs_est_tick_data {
	struct hlist_head	chains[IPVS_EST_TICK_CHAINS];
	DECLARE_BITMAP(present, IPVS_EST_TICK_CHAINS);
	DECLARE_BITMAP(full, IPVS_EST_TICK_CHAINS);
	int			chain_len[IPVS_EST_TICK_CHAINS];
};

/* Context for estimation kthread */
struct ip_vs_est_kt_data {
	struct netns_ipvs	*ipvs;
	struct task_struct	*task;		/* task if running */
	struct ip_vs_est_tick_data __rcu *ticks[IPVS_EST_NTICKS];
	DECLARE_BITMAP(avail, IPVS_EST_NTICKS);	/* tick has space for ests */
	unsigned long		est_timer;	/* estimation timer (jiffies) */
	struct ip_vs_stats	*calc_stats;	/* Used for calculation */
	int			tick_len[IPVS_EST_NTICKS];	/* est count */
	int			id;		/* ktid per netns */
	int			chain_max;	/* max ests per tick chain */
	int			tick_max;	/* max ests per tick */
	int			est_count;	/* attached ests to kthread */
	int			est_max_count;	/* max ests per kthread */
	int			add_row;	/* row for new ests */
	int			est_row;	/* estimated row */
};

struct dst_entry;
struct iphdr;
struct ip_vs_conn;
@@ -953,9 +1019,17 @@ struct netns_ipvs {
	struct ctl_table_header	*lblcr_ctl_header;
	struct ctl_table	*lblcr_ctl_table;
	/* ip_vs_est */
	struct list_head	est_list;	/* estimator list */
	spinlock_t		est_lock;
	struct timer_list	est_timer;	/* Estimation timer */
	struct delayed_work	est_reload_work;/* Reload kthread tasks */
	struct mutex		est_mutex;	/* protect kthread tasks */
	struct hlist_head	est_temp_list;	/* Ests during calc phase */
	struct ip_vs_est_kt_data **est_kt_arr;	/* Array of kthread data ptrs */
	unsigned long		est_max_threads;/* Hard limit of kthreads */
	int			est_calc_phase;	/* Calculation phase */
	int			est_chain_max;	/* Calculated chain_max */
	int			est_kt_count;	/* Allocated ptrs */
	int			est_add_ktid;	/* ktid where to add ests */
	atomic_t		est_genid;	/* kthreads reload genid */
	atomic_t		est_genid_done;	/* applied genid */
	/* ip_vs_sync */
	spinlock_t		sync_lock;
	struct ipvs_master_sync_state *ms;
@@ -1486,10 +1560,14 @@ int stop_sync_thread(struct netns_ipvs *ipvs, int state);
void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts);

/* IPVS rate estimator prototypes (from ip_vs_est.c) */
void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats);
int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats);
void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats);
void ip_vs_zero_estimator(struct ip_vs_stats *stats);
void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats);
void ip_vs_est_reload_start(struct netns_ipvs *ipvs);
int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
			    struct ip_vs_est_kt_data *kd);
void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd);

/* Various IPVS packet transmitters (from ip_vs_xmit.c) */
int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
net/netfilter/ipvs/ip_vs_ctl.c: +99 −27
@@ -49,8 +49,7 @@

MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME);

/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
static DEFINE_MUTEX(__ip_vs_mutex);
DEFINE_MUTEX(__ip_vs_mutex); /* Serialize configuration with sockopt/netlink */

/* sysctl variables */

@@ -241,6 +240,47 @@ static void defense_work_handler(struct work_struct *work)
}
#endif

static void est_reload_work_handler(struct work_struct *work)
{
	struct netns_ipvs *ipvs =
		container_of(work, struct netns_ipvs, est_reload_work.work);
	int genid_done = atomic_read(&ipvs->est_genid_done);
	unsigned long delay = HZ / 10;	/* repeat startups after failure */
	bool repeat = false;
	int genid;
	int id;

	mutex_lock(&ipvs->est_mutex);
	genid = atomic_read(&ipvs->est_genid);
	for (id = 0; id < ipvs->est_kt_count; id++) {
		struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id];

		/* netns clean up started, abort delayed work */
		if (!ipvs->enable)
			goto unlock;
		if (!kd)
			continue;
		/* New config ? Stop kthread tasks */
		if (genid != genid_done)
			ip_vs_est_kthread_stop(kd);
		if (!kd->task) {
			/* Do not start kthreads above 0 in calc phase */
			if ((!id || !ipvs->est_calc_phase) &&
			    ip_vs_est_kthread_start(ipvs, kd) < 0)
				repeat = true;
		}
	}

	atomic_set(&ipvs->est_genid_done, genid);

	if (repeat)
		queue_delayed_work(system_long_wq, &ipvs->est_reload_work,
				   delay);

unlock:
	mutex_unlock(&ipvs->est_mutex);
}

int
ip_vs_use_count_inc(void)
{
@@ -831,7 +871,7 @@ ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src)
{
#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c

	spin_lock_bh(&src->lock);
	spin_lock(&src->lock);

	IP_VS_SHOW_STATS_COUNTER(conns);
	IP_VS_SHOW_STATS_COUNTER(inpkts);
@@ -841,7 +881,7 @@ ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src)

	ip_vs_read_estimator(dst, src);

	spin_unlock_bh(&src->lock);
	spin_unlock(&src->lock);
}

static void
@@ -862,7 +902,7 @@ ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src)
static void
ip_vs_zero_stats(struct ip_vs_stats *stats)
{
	spin_lock_bh(&stats->lock);
	spin_lock(&stats->lock);

	/* get current counters as zero point, rates are zeroed */

@@ -876,7 +916,7 @@ ip_vs_zero_stats(struct ip_vs_stats *stats)

	ip_vs_zero_estimator(stats);

	spin_unlock_bh(&stats->lock);
	spin_unlock(&stats->lock);
}

/* Allocate fields after kzalloc */
@@ -998,7 +1038,6 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
	spin_unlock_bh(&dest->dst_lock);

	if (add) {
		ip_vs_start_estimator(svc->ipvs, &dest->stats);
		list_add_rcu(&dest->n_list, &svc->destinations);
		svc->num_dests++;
		sched = rcu_dereference_protected(svc->scheduler, 1);
@@ -1051,6 +1090,10 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
	if (ret < 0)
		goto err_alloc;

	ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
	if (ret < 0)
		goto err_stats;

	dest->af = udest->af;
	dest->protocol = svc->protocol;
	dest->vaddr = svc->addr;
@@ -1071,6 +1114,9 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
	LeaveFunction(2);
	return 0;

err_stats:
	ip_vs_stats_release(&dest->stats);

err_alloc:
	kfree(dest);
	return ret;
@@ -1135,14 +1181,18 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
			      IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
			      ntohs(dest->vport));

		ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
		if (ret < 0)
			goto err;
		__ip_vs_update_dest(svc, dest, udest, 1);
		ret = 0;
	} else {
		/*
		 * Allocate and initialize the dest structure
		 */
		ret = ip_vs_new_dest(svc, udest);
	}

err:
	LeaveFunction(2);

	return ret;
@@ -1420,6 +1470,10 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
		sched = NULL;
	}

	ret = ip_vs_start_estimator(ipvs, &svc->stats);
	if (ret < 0)
		goto out_err;

	/* Bind the ct retriever */
	RCU_INIT_POINTER(svc->pe, pe);
	pe = NULL;
@@ -1432,8 +1486,6 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
	if (svc->pe && svc->pe->conn_out)
		atomic_inc(&ipvs->conn_out_counter);

	ip_vs_start_estimator(ipvs, &svc->stats);

	/* Count only IPv4 services for old get/setsockopt interface */
	if (svc->af == AF_INET)
		ipvs->num_services++;
@@ -1444,8 +1496,15 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
	ip_vs_svc_hash(svc);

	*svc_p = svc;

	if (!ipvs->enable) {
		/* Now there is a service - full throttle */
		ipvs->enable = 1;

		/* Start estimation for first time */
		ip_vs_est_reload_start(ipvs);
	}

	return 0;


@@ -4065,13 +4124,16 @@ static void ip_vs_genl_unregister(void)
static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
{
	struct net *net = ipvs->net;
	int idx;
	struct ctl_table *tbl;
	int idx, ret;

	atomic_set(&ipvs->dropentry, 0);
	spin_lock_init(&ipvs->dropentry_lock);
	spin_lock_init(&ipvs->droppacket_lock);
	spin_lock_init(&ipvs->securetcp_lock);
	INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
	INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work,
			  expire_nodest_conn_handler);

	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
@@ -4139,24 +4201,27 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
		tbl[idx++].mode = 0444;
#endif

	ret = -ENOMEM;
	ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
	if (ipvs->sysctl_hdr == NULL) {
		if (!net_eq(net, &init_net))
			kfree(tbl);
		return -ENOMEM;
	}
	if (!ipvs->sysctl_hdr)
		goto err;
	ipvs->sysctl_tbl = tbl;

	ret = ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s);
	if (ret < 0)
		goto err;

	/* Schedule defense work */
	INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
	queue_delayed_work(system_long_wq, &ipvs->defense_work,
			   DEFENSE_TIMER_PERIOD);

	/* Init delayed work for expiring no dest conn */
	INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work,
			  expire_nodest_conn_handler);

	ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s);
	return 0;

err:
	unregister_net_sysctl_table(ipvs->sysctl_hdr);
	if (!net_eq(net, &init_net))
		kfree(tbl);
	return ret;
}

static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
@@ -4189,6 +4254,7 @@ static struct notifier_block ip_vs_dst_notifier = {

int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
{
	int ret = -ENOMEM;
	int idx;

	/* Initialize rs_table */
@@ -4202,10 +4268,12 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
	atomic_set(&ipvs->nullsvc_counter, 0);
	atomic_set(&ipvs->conn_out_counter, 0);

	INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler);

	/* procfs stats */
	ipvs->tot_stats = kzalloc(sizeof(*ipvs->tot_stats), GFP_KERNEL);
	if (!ipvs->tot_stats)
		return -ENOMEM;
		goto out;
	if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0)
		goto err_tot_stats;

@@ -4222,7 +4290,8 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
		goto err_percpu;
#endif

	if (ip_vs_control_net_init_sysctl(ipvs))
	ret = ip_vs_control_net_init_sysctl(ipvs);
	if (ret < 0)
		goto err;

	return 0;
@@ -4243,13 +4312,16 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)

err_tot_stats:
	kfree(ipvs->tot_stats);
	return -ENOMEM;

out:
	return ret;
}

void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs)
{
	ip_vs_trash_cleanup(ipvs);
	ip_vs_control_net_cleanup_sysctl(ipvs);
	cancel_delayed_work_sync(&ipvs->est_reload_work);
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
	remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
net/netfilter/ipvs/ip_vs_est.c: +808 −68
(diff collapsed: preview size limit exceeded)