sched/psi: Rearrange polling code in preparation (7fab21fa) · Commits · EulixOS / Software / Kernel

kernel/sched/psi.c

+98 −98

Original line number	Diff line number	Diff line
		@@ -384,92 +384,6 @@ static void collect_percpu_times(struct psi_group *group,
		*pchanged_states = changed_states;
		}

		/*
		 * Feed the PSI_AVGS time accumulated since the previous update into the
		 * group's running averages through calc_avgs().
		 *
		 * @group: group whose avg_total[], avg[] and avg_last_update are updated
		 * @now:   current timestamp, in the same unit as group->avg_next_update
		 *
		 * Returns the time the next update is due: the previous deadline moved
		 * forward by one psi_period plus every whole period that was missed.
		 * group->avg_next_update itself is not written here.
		 */
		static u64 update_averages(struct psi_group *group, u64 now)
		{
			u64 deadline = group->avg_next_update;
			u64 overdue = now - deadline;
			unsigned long missed_periods = 0;
			u64 next_deadline;
			u64 period;
			int state;

			/* avgX= */
			if (overdue >= psi_period) {
				missed_periods = div_u64(overdue, psi_period);
			}

			/*
			 * Clock ticks can arrive late for many reasons, particularly
			 * on a busy system. The next deadline is therefore derived
			 * from the previous deadline in fixed psi_period steps so the
			 * schedule does not drift, while the sampling period below is
			 * measured from the time that actually elapsed between ticks.
			 */
			next_deadline = deadline + ((1 + missed_periods) * psi_period);
			period = now - (group->avg_last_update + (missed_periods * psi_period));
			group->avg_last_update = now;

			for (state = 0; state < NR_PSI_STATES - 1; state++) {
				u32 sample;

				sample = group->total[PSI_AVGS][state] - group->avg_total[state];

				/*
				 * The time buckets are sampled without locking, so a
				 * recorded delta may slip into the following period.
				 * Under full pressure that can make a sample longer
				 * than the period itself.
				 *
				 * Reporting nonsensical pressure above 100% is not
				 * acceptable, and neither is silently discarding the
				 * excess. The sample is clamped to the period and only
				 * the clamped amount is added to avg_total[], so the
				 * overage stays pending and is picked up in a later
				 * period once pressure subsides: the pressure curve is
				 * reported in full, merely delayed by one period.
				 *
				 * The error does not accumulate. Whenever a delta
				 * slips from period P into P+1, it frees up its own
				 * time T in P.
				 */
				if (sample > period) {
					sample = period;
				}

				group->avg_total[state] += sample;
				calc_avgs(group->avg[state], missed_periods, sample, period);
			}

			return next_deadline;
		}

		/*
		 * Delayed-work handler for a group's running averages.
		 *
		 * @work: the work_struct embedded in the delayed work stored as
		 *        psi_group::avgs_work
		 *
		 * Runs entirely under group->avgs_lock.
		 */
		static void psi_avgs_work(struct work_struct *work)
		{
			struct delayed_work *dwork = to_delayed_work(work);
			struct psi_group *group = container_of(dwork, struct psi_group,
							       avgs_work);
			unsigned long delay;
			u32 changed_states;
			u64 now;

			mutex_lock(&group->avgs_lock);

			now = sched_clock();
			collect_percpu_times(group, PSI_AVGS, &changed_states);

			/*
			 * While tasks are active, the per-cpu times are folded
			 * periodically and the samples fed into the running averages.
			 * When everything is idle and there is nothing to process, the
			 * clock is left stopped; after a restart the averages catch up
			 * in a single step - see calc_avgs() and missed_periods.
			 */
			if (group->avg_next_update <= now) {
				group->avg_next_update = update_averages(group, now);
			}

			/* Re-arm one jiffy past the time remaining until the next update. */
			if (changed_states & PSI_STATE_RESCHEDULE) {
				delay = nsecs_to_jiffies(group->avg_next_update - now) + 1;
				schedule_delayed_work(dwork, delay);
			}

			mutex_unlock(&group->avgs_lock);
		}

		/* Trigger tracking window manipulations */
		static void window_reset(struct psi_window *win, u64 now, u64 value,
		u64 prev_growth)
		@@ -516,18 +430,6 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
		return growth;
		}

		/*
		 * Reset the polling state of @group as of @now.
		 *
		 * Every trigger on group->triggers gets its window reset against the
		 * current PSI_POLL total of the state it watches, the PSI_POLL totals
		 * are copied into group->polling_total, and the next polling update is
		 * set to @now plus group->poll_min_period.
		 */
		static void init_triggers(struct psi_group *group, u64 now)
		{
			struct psi_trigger *trigger;

			list_for_each_entry(trigger, &group->triggers, node) {
				u64 total = group->total[PSI_POLL][trigger->state];

				window_reset(&trigger->win, now, total, 0);
			}

			memcpy(group->polling_total, group->total[PSI_POLL],
			       sizeof(group->polling_total));

			group->polling_next_update = now + group->poll_min_period;
		}

		static u64 update_triggers(struct psi_group *group, u64 now)
		{
		struct psi_trigger *t;
		@@ -590,6 +492,104 @@ static u64 update_triggers(struct psi_group *group, u64 now)
		return now + group->poll_min_period;
		}

		/*
		 * Feed the PSI_AVGS time accumulated since the previous update into the
		 * group's running averages through calc_avgs().
		 *
		 * @group: group whose avg_total[], avg[] and avg_last_update are updated
		 * @now:   current timestamp, in the same unit as group->avg_next_update
		 *
		 * Returns the time the next update is due: the previous deadline moved
		 * forward by one psi_period plus every whole period that was missed.
		 * group->avg_next_update itself is not written here.
		 */
		static u64 update_averages(struct psi_group *group, u64 now)
		{
			u64 deadline = group->avg_next_update;
			u64 overdue = now - deadline;
			unsigned long missed_periods = 0;
			u64 next_deadline;
			u64 period;
			int state;

			/* avgX= */
			if (overdue >= psi_period) {
				missed_periods = div_u64(overdue, psi_period);
			}

			/*
			 * Clock ticks can arrive late for many reasons, particularly
			 * on a busy system. The next deadline is therefore derived
			 * from the previous deadline in fixed psi_period steps so the
			 * schedule does not drift, while the sampling period below is
			 * measured from the time that actually elapsed between ticks.
			 */
			next_deadline = deadline + ((1 + missed_periods) * psi_period);
			period = now - (group->avg_last_update + (missed_periods * psi_period));
			group->avg_last_update = now;

			for (state = 0; state < NR_PSI_STATES - 1; state++) {
				u32 sample;

				sample = group->total[PSI_AVGS][state] - group->avg_total[state];

				/*
				 * The time buckets are sampled without locking, so a
				 * recorded delta may slip into the following period.
				 * Under full pressure that can make a sample longer
				 * than the period itself.
				 *
				 * Reporting nonsensical pressure above 100% is not
				 * acceptable, and neither is silently discarding the
				 * excess. The sample is clamped to the period and only
				 * the clamped amount is added to avg_total[], so the
				 * overage stays pending and is picked up in a later
				 * period once pressure subsides: the pressure curve is
				 * reported in full, merely delayed by one period.
				 *
				 * The error does not accumulate. Whenever a delta
				 * slips from period P into P+1, it frees up its own
				 * time T in P.
				 */
				if (sample > period) {
					sample = period;
				}

				group->avg_total[state] += sample;
				calc_avgs(group->avg[state], missed_periods, sample, period);
			}

			return next_deadline;
		}

		/*
		 * Delayed-work handler for a group's running averages.
		 *
		 * @work: the work_struct embedded in the delayed work stored as
		 *        psi_group::avgs_work
		 *
		 * Runs entirely under group->avgs_lock.
		 */
		static void psi_avgs_work(struct work_struct *work)
		{
			struct delayed_work *dwork = to_delayed_work(work);
			struct psi_group *group = container_of(dwork, struct psi_group,
							       avgs_work);
			unsigned long delay;
			u32 changed_states;
			u64 now;

			mutex_lock(&group->avgs_lock);

			now = sched_clock();
			collect_percpu_times(group, PSI_AVGS, &changed_states);

			/*
			 * While tasks are active, the per-cpu times are folded
			 * periodically and the samples fed into the running averages.
			 * When everything is idle and there is nothing to process, the
			 * clock is left stopped; after a restart the averages catch up
			 * in a single step - see calc_avgs() and missed_periods.
			 */
			if (group->avg_next_update <= now) {
				group->avg_next_update = update_averages(group, now);
			}

			/* Re-arm one jiffy past the time remaining until the next update. */
			if (changed_states & PSI_STATE_RESCHEDULE) {
				delay = nsecs_to_jiffies(group->avg_next_update - now) + 1;
				schedule_delayed_work(dwork, delay);
			}

			mutex_unlock(&group->avgs_lock);
		}

		/*
		 * Reset the polling state of @group as of @now.
		 *
		 * Every trigger on group->triggers gets its window reset against the
		 * current PSI_POLL total of the state it watches, the PSI_POLL totals
		 * are copied into group->polling_total, and the next polling update is
		 * set to @now plus group->poll_min_period.
		 */
		static void init_triggers(struct psi_group *group, u64 now)
		{
			struct psi_trigger *trigger;

			list_for_each_entry(trigger, &group->triggers, node) {
				u64 total = group->total[PSI_POLL][trigger->state];

				window_reset(&trigger->win, now, total, 0);
			}

			memcpy(group->polling_total, group->total[PSI_POLL],
			       sizeof(group->polling_total));

			group->polling_next_update = now + group->poll_min_period;
		}

		/* Schedule polling if it's not already scheduled or forced. */
		static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay,
		bool force)