Commit 2f1835df authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'irq-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull irq updates from Ingo Molnar:
 "The changes in this cycle were:

   - Remove the irq timings/variance statistics code that tried to
     predict when the next interrupt would occur, which didn't work out
     as hoped and is replaced by another mechanism.

   - This new mechanism is the 'array suffix computation' estimate,
     which is superior to the previous one as it can detect not just a
     single periodic pattern, but independent periodic patterns along a
     log-2 scale of bucketing and exponential moving average. The
     comments are longer than the code - and it works better at
     predicting various complex interrupt patterns from real-world
     devices than the previous estimate.

   - avoid IRQ-work self-IPIs on the local CPU

   - fix work-list corruption in irq_set_affinity_notifier()"

* 'irq-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  irq_work: Do not raise an IPI when queueing work on the local CPU
  genirq/devres: Use struct_size() in devm_kzalloc()
  genirq/timings: Add array suffix computation code
  genirq/timings: Remove variance computation code
  genirq: Prevent use-after-free and work list corruption
parents d90dcc1f 471ba0e6
Loading
Loading
Loading
Loading
+1 −2
Original line number Diff line number Diff line
@@ -220,9 +220,8 @@ devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct,
			    irq_flow_handler_t handler)
{
	struct irq_chip_generic *gc;
	unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);

	gc = devm_kzalloc(dev, sz, GFP_KERNEL);
	gc = devm_kzalloc(dev, struct_size(gc, chip_types, num_ct), GFP_KERNEL);
	if (gc)
		irq_init_generic_chip(gc, name, num_ct,
				      irq_base, reg_base, handler);
+3 −1
Original line number Diff line number Diff line
@@ -357,8 +357,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
	desc->affinity_notify = notify;
	raw_spin_unlock_irqrestore(&desc->lock, flags);

	if (old_notify)
	if (old_notify) {
		cancel_work_sync(&old_notify->work);
		kref_put(&old_notify->kref, old_notify->release);
	}

	return 0;
}
+363 −159
Original line number Diff line number Diff line
@@ -9,6 +9,7 @@
#include <linux/idr.h>
#include <linux/irq.h>
#include <linux/math64.h>
#include <linux/log2.h>

#include <trace/events/irq.h>

@@ -18,16 +19,6 @@ DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);

DEFINE_PER_CPU(struct irq_timings, irq_timings);

struct irqt_stat {
	u64	next_evt;
	u64	last_ts;
	u64	variance;
	u32	avg;
	u32	nr_samples;
	int	anomalies;
	int	valid;
};

static DEFINE_IDR(irqt_stats);

void irq_timings_enable(void)
@@ -40,182 +31,408 @@ void irq_timings_disable(void)
	static_branch_disable(&irq_timing_enabled);
}

/**
 * irqs_update - update the irq timing statistics with a new timestamp
/*
 * The main goal of this algorithm is to predict the next interrupt
 * occurrence on the current CPU.
 *
 * Currently, the interrupt timings are stored in a circular array
 * buffer every time there is an interrupt, as a tuple: the interrupt
 * number and the associated timestamp when the event occurred <irq,
 * timestamp>.
 *
 * For every interrupt occurring in a short period of time, we can
 * measure the elapsed time between the occurrences for the same
 * interrupt and we end up with a suite of intervals. The experience
 * showed the interrupts are often coming following a periodic
 * pattern.
 *
 * The objective of the algorithm is to find out this periodic pattern
 * in a fastest way and use its period to predict the next irq event.
 *
 * When the next interrupt event is requested, we are in the situation
 * where the interrupts are disabled and the circular buffer
 * containing the timings is filled with the events which happened
 * after the previous next-interrupt-event request.
 *
 * At this point, we read the circular buffer and we fill the irq
 * related statistics structure. After this step, the circular array
 * containing the timings is empty because all the values are
 * dispatched in their corresponding buffers.
 *
 * Now for each interrupt, we can predict the next event by using the
 * suffix array, log interval and exponential moving average
 *
 * 1. Suffix array
 *
 * Suffix array is an array of all the suffixes of a string. It is
 * widely used as a data structure for compression, text search, ...
 * For instance for the word 'banana', the suffixes will be: 'banana'
 * 'anana' 'nana' 'ana' 'na' 'a'
 *
 * Usually, the suffix array is sorted but for our purpose it is
 * not necessary and won't provide any improvement in the context of
 * the solved problem where we clearly define the boundaries of the
 * search by a max period and min period.
 *
 * The suffix array will build a suite of intervals of different
 * length and will look for the repetition of each suite. If the suite
 * is repeating then we have the period because it is the length of
 * the suite whatever its position in the buffer.
 *
 * 2. Log interval
 *
 * We saw the irq timings allow to compute the interval of the
 * occurrences for a specific interrupt. We can reasonibly assume the
 * longer is the interval, the higher is the error for the next event
 * and we can consider storing those interval values into an array
 * where each slot in the array correspond to an interval at the power
 * of 2 of the index. For example, index 12 will contain values
 * between 2^11 and 2^12.
 *
 * At the end we have an array of values where at each index defines a
 * [2^index - 1, 2 ^ index] interval values allowing to store a large
 * number of values inside a small array.
 *
 * For example, if we have the value 1123, then we store it at
 * ilog2(1123) = 10 index value.
 *
 * Storing those value at the specific index is done by computing an
 * exponential moving average for this specific slot. For instance,
 * for values 1800, 1123, 1453, ... fall under the same slot (10) and
 * the exponential moving average is computed every time a new value
 * is stored at this slot.
 *
 * 3. Exponential Moving Average
 *
 * The EMA is largely used to track a signal for stocks or as a low
 * pass filter. The magic of the formula, is it is very simple and the
 * reactivity of the average can be tuned with the factors called
 * alpha.
 *
 * The higher the alphas are, the faster the average respond to the
 * signal change. In our case, if a slot in the array is a big
 * interval, we can have numbers with a big difference between
 * them. The impact of those differences in the average computation
 * can be tuned by changing the alpha value.
 *
 *
 *  -- The algorithm --
 *
 * We saw the different processing above, now let's see how they are
 * used together.
 *
 * For each interrupt:
 *	For each interval:
 *		Compute the index = ilog2(interval)
 *		Compute a new_ema(buffer[index], interval)
 *		Store the index in a circular buffer
 *
 *	Compute the suffix array of the indexes
 *
 *	For each suffix:
 *		If the suffix is reverse-found 3 times
 *			Return suffix
 *
 *	Return Not found
 *
 * However we can not have endless suffix array to be build, it won't
 * make sense and it will add an extra overhead, so we can restrict
 * this to a maximum suffix length of 5 and a minimum suffix length of
 * 2. The experience showed 5 is the majority of the maximum pattern
 * period found for different devices.
 *
 * The result is a pattern finding less than 1us for an interrupt.
 *
 * Example based on real values:
 *
 * Example 1 : MMC write/read interrupt interval:
 *
 * @irqs: an irqt_stat struct pointer
 * @ts: the new timestamp
 *	223947, 1240, 1384, 1386, 1386,
 *	217416, 1236, 1384, 1386, 1387,
 *	214719, 1241, 1386, 1387, 1384,
 *	213696, 1234, 1384, 1386, 1388,
 *	219904, 1240, 1385, 1389, 1385,
 *	212240, 1240, 1386, 1386, 1386,
 *	214415, 1236, 1384, 1386, 1387,
 *	214276, 1234, 1384, 1388, ?
 *
 * The statistics are computed online, in other words, the code is
 * designed to compute the statistics on a stream of values rather
 * than doing multiple passes on the values to compute the average,
 * then the variance. The integer division introduces a loss of
 * precision but with an acceptable error margin regarding the results
 * we would have with the double floating precision: we are dealing
 * with nanosec, so big numbers, consequently the mantisse is
 * negligeable, especially when converting the time in usec
 * afterwards.
 * For each element, apply ilog2(value)
 *
 * The computation happens at idle time. When the CPU is not idle, the
 * interrupts' timestamps are stored in the circular buffer, when the
 * CPU goes idle and this routine is called, all the buffer's values
 * are injected in the statistical model continuying to extend the
 * statistics from the previous busy-idle cycle.
 *	15, 8, 8, 8, 8,
 *	15, 8, 8, 8, 8,
 *	15, 8, 8, 8, 8,
 *	15, 8, 8, 8, 8,
 *	15, 8, 8, 8, 8,
 *	15, 8, 8, 8, 8,
 *	15, 8, 8, 8, 8,
 *	15, 8, 8, 8, ?
 *
 * The observations showed a device will trigger a burst of periodic
 * interrupts followed by one or two peaks of longer time, for
 * instance when a SD card device flushes its cache, then the periodic
 * intervals occur again. A one second inactivity period resets the
 * stats, that gives us the certitude the statistical values won't
 * exceed 1x10^9, thus the computation won't overflow.
 * Max period of 5, we take the last (max_period * 3) 15 elements as
 * we can be confident if the pattern repeats itself three times it is
 * a repeating pattern.
 *
 * Basically, the purpose of the algorithm is to watch the periodic
 * interrupts and eliminate the peaks.
 *	             8,
 *	15, 8, 8, 8, 8,
 *	15, 8, 8, 8, 8,
 *	15, 8, 8, 8, ?
 *
 * An interrupt is considered periodically stable if the interval of
 * its occurences follow the normal distribution, thus the values
 * comply with:
 * Suffixes are:
 *
 *      avg - 3 x stddev < value < avg + 3 x stddev
 *  1) 8, 15, 8, 8, 8  <- max period
 *  2) 8, 15, 8, 8
 *  3) 8, 15, 8
 *  4) 8, 15           <- min period
 *
 * Which can be simplified to:
 * From there we search the repeating pattern for each suffix.
 *
 *      -3 x stddev < value - avg < 3 x stddev
 * buffer: 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8
 *         |   |  |  |  |  |   |  |  |  |  |   |  |  |  |
 *         8, 15, 8, 8, 8  |   |  |  |  |  |   |  |  |  |
 *                         8, 15, 8, 8, 8  |   |  |  |  |
 *                                         8, 15, 8, 8, 8
 *
 *      abs(value - avg) < 3 x stddev
 * When moving the suffix, we found exactly 3 matches.
 *
 * In order to save a costly square root computation, we use the
 * variance. For the record, stddev = sqrt(variance). The equation
 * above becomes:
 * The first suffix with period 5 is repeating.
 *
 *      abs(value - avg) < 3 x sqrt(variance)
 * The next event is (3 * max_period) % suffix_period
 *
 * And finally we square it:
 * In this example, the result 0, so the next event is suffix[0] => 8
 *
 *      (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2
 * However, 8 is the index in the array of exponential moving average
 * which was calculated on the fly when storing the values, so the
 * interval is ema[8] = 1366
 *
 *      (value - avg) x (value - avg) < 9 x variance
 *
 * Statistically speaking, any values out of this interval is
 * considered as an anomaly and is discarded. However, a normal
 * distribution appears when the number of samples is 30 (it is the
 * rule of thumb in statistics, cf. "30 samples" on Internet). When
 * there are three consecutive anomalies, the statistics are resetted.
 * Example 2:
 *
 *	4, 3, 5, 100,
 *	3, 3, 5, 117,
 *	4, 4, 5, 112,
 *	4, 3, 4, 110,
 *	3, 5, 3, 117,
 *	4, 4, 5, 112,
 *	4, 3, 4, 110,
 *	3, 4, 5, 112,
 *	4, 3, 4, 110
 *
 * ilog2
 *
 *	0, 0, 0, 4,
 *	0, 0, 0, 4,
 *	0, 0, 0, 4,
 *	0, 0, 0, 4,
 *	0, 0, 0, 4,
 *	0, 0, 0, 4,
 *	0, 0, 0, 4,
 *	0, 0, 0, 4,
 *	0, 0, 0, 4
 *
 * Max period 5:
 *	   0, 0, 4,
 *	0, 0, 0, 4,
 *	0, 0, 0, 4,
 *	0, 0, 0, 4
 *
 * Suffixes:
 *
 *  1) 0, 0, 4, 0, 0
 *  2) 0, 0, 4, 0
 *  3) 0, 0, 4
 *  4) 0, 0
 *
 * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
 *         |  |  |  |  |  |  X
 *         0, 0, 4, 0, 0, |  X
 *                        0, 0
 *
 * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
 *         |  |  |  |  |  |  |  |  |  |  |  |  |  |  |
 *         0, 0, 4, 0, |  |  |  |  |  |  |  |  |  |  |
 *                     0, 0, 4, 0, |  |  |  |  |  |  |
 *                                 0, 0, 4, 0, |  |  |
 *                                             0  0  4
 *
 * Pattern is found 3 times, the remaining is 1 which results from
 * (max_period * 3) % suffix_period. This value is the index in the
 * suffix arrays. The suffix array for a period 4 has the value 4
 * at index 1.
 */
static void irqs_update(struct irqt_stat *irqs, u64 ts)
#define EMA_ALPHA_VAL		64
#define EMA_ALPHA_SHIFT		7

#define PREDICTION_PERIOD_MIN	2
#define PREDICTION_PERIOD_MAX	5
#define PREDICTION_FACTOR	4
#define PREDICTION_MAX		10 /* 2 ^ PREDICTION_MAX useconds */
#define PREDICTION_BUFFER_SIZE	16 /* slots for EMAs, hardly more than 16 */

struct irqt_stat {
	u64	last_ts;
	u64	ema_time[PREDICTION_BUFFER_SIZE];
	int	timings[IRQ_TIMINGS_SIZE];
	int	circ_timings[IRQ_TIMINGS_SIZE];
	int	count;
};

/*
 * Exponential moving average computation
 */
static u64 irq_timings_ema_new(u64 value, u64 ema_old)
{
	u64 old_ts = irqs->last_ts;
	u64 variance = 0;
	u64 interval;
	s64 diff;

	if (unlikely(!ema_old))
		return value;

	diff = (value - ema_old) * EMA_ALPHA_VAL;
	/*
	 * The timestamps are absolute time values, we need to compute
	 * the timing interval between two interrupts.
	 * We can use a s64 type variable to be added with the u64
	 * ema_old variable as this one will never have its topmost
	 * bit set, it will be always smaller than 2^63 nanosec
	 * interrupt interval (292 years).
	 */
	irqs->last_ts = ts;
	return ema_old + (diff >> EMA_ALPHA_SHIFT);
}

static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
{
	int i;

	/*
	 * The interval type is u64 in order to deal with the same
	 * type in our computation, that prevent mindfuck issues with
	 * overflow, sign and division.
	 * The buffer contains the suite of intervals, in a ilog2
	 * basis, we are looking for a repetition. We point the
	 * beginning of the search three times the length of the
	 * period beginning at the end of the buffer. We do that for
	 * each suffix.
	 */
	interval = ts - old_ts;
	for (i = period_max; i >= PREDICTION_PERIOD_MIN ; i--) {

		int *begin = &buffer[len - (i * 3)];
		int *ptr = begin;

		/*
	 * The interrupt triggered more than one second apart, that
	 * ends the sequence as predictible for our purpose. In this
	 * case, assume we have the beginning of a sequence and the
	 * timestamp is the first value. As it is impossible to
	 * predict anything at this point, return.
	 *
	 * Note the first timestamp of the sequence will always fall
	 * in this test because the old_ts is zero. That is what we
	 * want as we need another timestamp to compute an interval.
		 * We look if the suite with period 'i' repeat
		 * itself. If it is truncated at the end, as it
		 * repeats we can use the period to find out the next
		 * element.
		 */
	if (interval >= NSEC_PER_SEC) {
		memset(irqs, 0, sizeof(*irqs));
		irqs->last_ts = ts;
		return;
		while (!memcmp(ptr, begin, i * sizeof(*ptr))) {
			ptr += i;
			if (ptr >= &buffer[len])
				return begin[((i * 3) % i)];
		}
	}

	/*
	 * Pre-compute the delta with the average as the result is
	 * used several times in this function.
	 */
	diff = interval - irqs->avg;
	return -1;
}

static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now)
{
	int index, i, period_max, count, start, min = INT_MAX;

	if ((now - irqs->last_ts) >= NSEC_PER_SEC) {
		irqs->count = irqs->last_ts = 0;
		return U64_MAX;
	}

	/*
	 * Increment the number of samples.
	 * As we want to find three times the repetition, we need a
	 * number of intervals greater or equal to three times the
	 * maximum period, otherwise we truncate the max period.
	 */
	irqs->nr_samples++;
	period_max = irqs->count > (3 * PREDICTION_PERIOD_MAX) ?
		PREDICTION_PERIOD_MAX : irqs->count / 3;

	/*
	 * Online variance divided by the number of elements if there
	 * is more than one sample.  Normally the formula is division
	 * by nr_samples - 1 but we assume the number of element will be
	 * more than 32 and dividing by 32 instead of 31 is enough
	 * precise.
	 * If we don't have enough irq timings for this prediction,
	 * just bail out.
	 */
	if (likely(irqs->nr_samples > 1))
		variance = irqs->variance >> IRQ_TIMINGS_SHIFT;
	if (period_max <= PREDICTION_PERIOD_MIN)
		return U64_MAX;

	/*
	 * The rule of thumb in statistics for the normal distribution
	 * is having at least 30 samples in order to have the model to
	 * apply. Values outside the interval are considered as an
	 * anomaly.
	 * 'count' will depends if the circular buffer wrapped or not
	 */
	if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) {
	count = irqs->count < IRQ_TIMINGS_SIZE ?
		irqs->count : IRQ_TIMINGS_SIZE;

	start = irqs->count < IRQ_TIMINGS_SIZE ?
		0 : (irqs->count & IRQ_TIMINGS_MASK);

	/*
		 * After three consecutive anomalies, we reset the
		 * stats as it is no longer stable enough.
	 * Copy the content of the circular buffer into another buffer
	 * in order to linearize the buffer instead of dealing with
	 * wrapping indexes and shifted array which will be prone to
	 * error and extremelly difficult to debug.
	 */
		if (irqs->anomalies++ >= 3) {
			memset(irqs, 0, sizeof(*irqs));
			irqs->last_ts = ts;
			return;
	for (i = 0; i < count; i++) {
		int index = (start + i) & IRQ_TIMINGS_MASK;

		irqs->timings[i] = irqs->circ_timings[index];
		min = min_t(int, irqs->timings[i], min);
	}
	} else {

	index = irq_timings_next_event_index(irqs->timings, count, period_max);
	if (index < 0)
		return irqs->last_ts + irqs->ema_time[min];

	return irqs->last_ts + irqs->ema_time[index];
}

static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
{
	u64 old_ts = irqs->last_ts;
	u64 interval;
	int index;

	/*
		 * The anomalies must be consecutives, so at this
		 * point, we reset the anomalies counter.
	 * The timestamps are absolute time values, we need to compute
	 * the timing interval between two interrupts.
	 */
		irqs->anomalies = 0;
	}
	irqs->last_ts = ts;

	/*
	 * The interrupt is considered stable enough to try to predict
	 * the next event on it.
	 * The interval type is u64 in order to deal with the same
	 * type in our computation, that prevent mindfuck issues with
	 * overflow, sign and division.
	 */
	irqs->valid = 1;
	interval = ts - old_ts;

	/*
	 * Online average algorithm:
	 *
	 *  new_average = average + ((value - average) / count)
	 *
	 * The variance computation depends on the new average
	 * to be computed here first.
	 * The interrupt triggered more than one second apart, that
	 * ends the sequence as predictible for our purpose. In this
	 * case, assume we have the beginning of a sequence and the
	 * timestamp is the first value. As it is impossible to
	 * predict anything at this point, return.
	 *
	 * Note the first timestamp of the sequence will always fall
	 * in this test because the old_ts is zero. That is what we
	 * want as we need another timestamp to compute an interval.
	 */
	irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT);
	if (interval >= NSEC_PER_SEC) {
		irqs->count = 0;
		return;
	}

	/*
	 * Online variance algorithm:
	 *
	 *  new_variance = variance + (value - average) x (value - new_average)
	 *
	 * Warning: irqs->avg is updated with the line above, hence
	 * 'interval - irqs->avg' is no longer equal to 'diff'
	 * Get the index in the ema table for this interrupt. The
	 * PREDICTION_FACTOR increase the interval size for the array
	 * of exponential average.
	 */
	irqs->variance = irqs->variance + (diff * (interval - irqs->avg));
	index = likely(interval) ?
		ilog2((interval >> 10) / PREDICTION_FACTOR) : 0;

	/*
	 * Update the next event
	 * Store the index as an element of the pattern in another
	 * circular array.
	 */
	irqs->next_evt = ts + irqs->avg;
	irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index;

	irqs->ema_time[index] = irq_timings_ema_new(interval,
						    irqs->ema_time[index]);

	irqs->count++;
}

/**
@@ -259,6 +476,9 @@ u64 irq_timings_next_event(u64 now)
	 */
	lockdep_assert_irqs_disabled();

	if (!irqts->count)
		return next_evt;

	/*
	 * Number of elements in the circular buffer: If it happens it
	 * was flushed before, then the number of elements could be
@@ -269,21 +489,19 @@ u64 irq_timings_next_event(u64 now)
	 * type but with the cost of extra computation in the
	 * interrupt handler hot path. We choose efficiency.
	 *
	 * Inject measured irq/timestamp to the statistical model
	 * while decrementing the counter because we consume the data
	 * from our circular buffer.
	 * Inject measured irq/timestamp to the pattern prediction
	 * model while decrementing the counter because we consume the
	 * data from our circular buffer.
	 */
	for (i = irqts->count & IRQ_TIMINGS_MASK,

	i = (irqts->count & IRQ_TIMINGS_MASK) - 1;
	irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
	     irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {

	for (; irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
		irq = irq_timing_decode(irqts->values[i], &ts);

		s = idr_find(&irqt_stats, irq);
		if (s) {
			irqs = this_cpu_ptr(s);
			irqs_update(irqs, ts);
		}
		if (s)
			irq_timings_store(irq, this_cpu_ptr(s), ts);
	}

	/*
@@ -294,26 +512,12 @@ u64 irq_timings_next_event(u64 now)

		irqs = this_cpu_ptr(s);

		if (!irqs->valid)
			continue;

		if (irqs->next_evt <= now) {
			irq = i;
			next_evt = now;

			/*
			 * This interrupt mustn't use in the future
			 * until new events occur and update the
			 * statistics.
			 */
			irqs->valid = 0;
			break;
		}
		ts = __irq_timings_next_event(irqs, i, now);
		if (ts <= now)
			return now;

		if (irqs->next_evt < next_evt) {
			irq = i;
			next_evt = irqs->next_evt;
		}
		if (ts < next_evt)
			next_evt = ts;
	}

	return next_evt;
+42 −33
Original line number Diff line number Diff line
@@ -56,6 +56,36 @@ void __weak arch_irq_work_raise(void)
	 */
}

/* Enqueue on current CPU, work must already be claimed and preempt disabled */
static void __irq_work_queue_local(struct irq_work *work)
{
	/* If the work is "lazy", handle it from next tick if any */
	if (work->flags & IRQ_WORK_LAZY) {
		if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
		    tick_nohz_tick_stopped())
			arch_irq_work_raise();
	} else {
		if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
			arch_irq_work_raise();
	}
}

/* Enqueue the irq work @work on the current CPU */
bool irq_work_queue(struct irq_work *work)
{
	/* Only queue if not already pending */
	if (!irq_work_claim(work))
		return false;

	/* Queue the entry and raise the IPI if needed. */
	preempt_disable();
	__irq_work_queue_local(work);
	preempt_enable();

	return true;
}
EXPORT_SYMBOL_GPL(irq_work_queue);

/*
 * Enqueue the irq_work @work on @cpu unless it's already pending
 * somewhere.
@@ -64,53 +94,32 @@ void __weak arch_irq_work_raise(void)
 */
bool irq_work_queue_on(struct irq_work *work, int cpu)
{
#ifndef CONFIG_SMP
	return irq_work_queue(work);

#else /* CONFIG_SMP: */
	/* All work should have been flushed before going offline */
	WARN_ON_ONCE(cpu_is_offline(cpu));

#ifdef CONFIG_SMP

	/* Arch remote IPI send/receive backend aren't NMI safe */
	WARN_ON_ONCE(in_nmi());

	/* Only queue if not already pending */
	if (!irq_work_claim(work))
		return false;

	preempt_disable();
	if (cpu != smp_processor_id()) {
		/* Arch remote IPI send/receive backend aren't NMI safe */
		WARN_ON_ONCE(in_nmi());
		if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
			arch_send_call_function_single_ipi(cpu);

#else /* #ifdef CONFIG_SMP */
	irq_work_queue(work);
#endif /* #else #ifdef CONFIG_SMP */

	return true;
}

/* Enqueue the irq work @work on the current CPU */
bool irq_work_queue(struct irq_work *work)
{
	/* Only queue if not already pending */
	if (!irq_work_claim(work))
		return false;

	/* Queue the entry and raise the IPI if needed. */
	preempt_disable();

	/* If the work is "lazy", handle it from next tick if any */
	if (work->flags & IRQ_WORK_LAZY) {
		if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
		    tick_nohz_tick_stopped())
			arch_irq_work_raise();
	} else {
		if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
			arch_irq_work_raise();
		__irq_work_queue_local(work);
	}

	preempt_enable();

	return true;
#endif /* CONFIG_SMP */
}
EXPORT_SYMBOL_GPL(irq_work_queue);


bool irq_work_needs_cpu(void)
{