Commit abd50713 authored by Peter Zijlstra's avatar Peter Zijlstra Committed by Ingo Molnar
Browse files

perf: Reimplement frequency driven sampling



There was a bug in the old period code that caused intel_pmu_enable_all()
or native_write_msr_safe() to show up quite high in the profiles.

Staring at that code made my head hurt, so I rewrote it in a
hopefully simpler fashion. It's now fully symmetric between tick- and
overflow-driven adjustments and uses less data to boot.

The only complication is that it basically wants to do a u128 division.
The code approximates that in a rather simple truncate-until-it-fits
fashion, taking care to balance the terms while truncating.

This version does not generate that sampling artefact.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
parent ef12a141
Loading
Loading
Loading
Loading
+2 −3
Original line number Diff line number Diff line
@@ -498,9 +498,8 @@ struct hw_perf_event {
	atomic64_t			period_left;
	u64				interrupts;

	u64				freq_count;
	u64				freq_interrupts;
	u64				freq_stamp;
	u64				freq_time_stamp;
	u64				freq_count_stamp;
#endif
};

+92 −40
Original line number Diff line number Diff line
@@ -1423,14 +1423,83 @@ void perf_event_task_sched_in(struct task_struct *task)

static void perf_log_throttle(struct perf_event *event, int enable);

static void perf_adjust_period(struct perf_event *event, u64 events)
/*
 * perf_calculate_period - estimate the sample period that hits sample_freq
 * @event: event whose attr.sample_freq is the target frequency (in HZ)
 * @nsec:  length of the observation window, in nanoseconds
 * @count: number of events counted during that window
 *
 * Returns (@count * 10^9) / (@nsec * sample_freq), computed with 64-bit
 * arithmetic only.  Both products may need more than 64 bits, so precision
 * is shed one bit at a time until each product fits.  If the divisor ends
 * up as zero the (possibly truncated) dividend is returned instead of
 * dividing.
 */
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
{
	u64 frequency = event->attr.sample_freq;
	u64 sec = NSEC_PER_SEC;
	u64 divisor, dividend;

	int count_fls, nsec_fls, frequency_fls, sec_fls;

	/*
	 * The *_fls values track the bit width of each term; the sum of two
	 * widths bounds the width of their product.
	 */
	count_fls = fls64(count);
	nsec_fls = fls64(nsec);
	frequency_fls = fls64(frequency);
	sec_fls = 30;	/* width of 10^9: 2^29 < 10^9 < 2^30 */

	/*
	 * We got @count in @nsec, with a target of sample_freq HZ
	 * the target period becomes:
	 *
	 *             @count * 10^9
	 * period = -------------------
	 *          @nsec * sample_freq
	 *
	 */

	/*
	 * Reduce accuracy by one bit such that @a and @b converge
	 * to a similar magnitude.
	 */
#define REDUCE_FLS(a, b) 		\
do {					\
	if (a##_fls > b##_fls) {	\
		a >>= 1;		\
		a##_fls--;		\
	} else {			\
		b >>= 1;		\
		b##_fls--;		\
	}				\
} while (0)

	/*
	 * Reduce accuracy until either term fits in a u64, then proceed with
	 * the other, so that finally we can do a u64/u64 division.
	 *
	 * While both products are too wide, drop one bit from each side of
	 * the fraction per iteration so the ratio stays balanced.
	 */
	while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
		REDUCE_FLS(nsec, frequency);
		REDUCE_FLS(sec, count);
	}

	if (count_fls + sec_fls > 64) {
		/* The divisor already fits; keep shrinking the dividend terms. */
		divisor = nsec * frequency;

		while (count_fls + sec_fls > 64) {
			REDUCE_FLS(count, sec);
			/* Compensate on the other side of the fraction. */
			divisor >>= 1;
		}

		dividend = count * sec;
	} else {
		/* The dividend already fits; keep shrinking the divisor terms. */
		dividend = count * sec;

		while (nsec_fls + frequency_fls > 64) {
			REDUCE_FLS(nsec, frequency);
			/* Compensate on the other side of the fraction. */
			dividend >>= 1;
		}

		divisor = nsec * frequency;
	}

	/*
	 * The divisor can be zero here: the compensating shifts above may
	 * truncate it away entirely, and a zero @nsec or sample_freq makes
	 * the product zero.  Dividing by zero would trap, so fall back to
	 * the dividend (i.e. treat the divisor as 1).
	 */
	if (!divisor)
		return dividend;

	return div64_u64(dividend, divisor);
}

static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
{
	struct hw_perf_event *hwc = &event->hw;
	u64 period, sample_period;
	s64 delta;

	events *= hwc->sample_period;
	period = div64_u64(events, event->attr.sample_freq);
	period = perf_calculate_period(event, nsec, count);

	delta = (s64)(period - hwc->sample_period);
	delta = (delta + 7) / 8; /* low pass filter */
@@ -1441,13 +1510,22 @@ static void perf_adjust_period(struct perf_event *event, u64 events)
		sample_period = 1;

	hwc->sample_period = sample_period;

	if (atomic64_read(&hwc->period_left) > 8*sample_period) {
		perf_disable();
		event->pmu->disable(event);
		atomic64_set(&hwc->period_left, 0);
		event->pmu->enable(event);
		perf_enable();
	}
}

static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
{
	struct perf_event *event;
	struct hw_perf_event *hwc;
	u64 interrupts, freq;
	u64 interrupts, now;
	s64 delta;

	raw_spin_lock(&ctx->lock);
	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
@@ -1468,44 +1546,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
		if (interrupts == MAX_INTERRUPTS) {
			perf_log_throttle(event, 1);
			event->pmu->unthrottle(event);
			interrupts = 2*sysctl_perf_event_sample_rate/HZ;
		}

		if (!event->attr.freq || !event->attr.sample_freq)
			continue;

		/*
		 * if the specified freq < HZ then we need to skip ticks
		 */
		if (event->attr.sample_freq < HZ) {
			freq = event->attr.sample_freq;

			hwc->freq_count += freq;
			hwc->freq_interrupts += interrupts;

			if (hwc->freq_count < HZ)
				continue;

			interrupts = hwc->freq_interrupts;
			hwc->freq_interrupts = 0;
			hwc->freq_count -= HZ;
		} else
			freq = HZ;

		perf_adjust_period(event, freq * interrupts);
		event->pmu->read(event);
		now = atomic64_read(&event->count);
		delta = now - hwc->freq_count_stamp;
		hwc->freq_count_stamp = now;

		/*
		 * In order to avoid being stalled by an (accidental) huge
		 * sample period, force reset the sample period if we didn't
		 * get any events in this freq period.
		 */
		if (!interrupts) {
			perf_disable();
			event->pmu->disable(event);
			atomic64_set(&hwc->period_left, 0);
			event->pmu->enable(event);
			perf_enable();
		}
		if (delta > 0)
			perf_adjust_period(event, TICK_NSEC, delta);
	}
	raw_spin_unlock(&ctx->lock);
}
@@ -3768,12 +3820,12 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,

	if (event->attr.freq) {
		u64 now = perf_clock();
		s64 delta = now - hwc->freq_stamp;
		s64 delta = now - hwc->freq_time_stamp;

		hwc->freq_stamp = now;
		hwc->freq_time_stamp = now;

		if (delta > 0 && delta < TICK_NSEC)
			perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
		if (delta > 0 && delta < 2*TICK_NSEC)
			perf_adjust_period(event, delta, hwc->last_period);
	}

	/*